@SuppressWarnings( "unused")
public final class Test {
- private final static boolean PERFORM_DB_TESTS = false;
+ private final static boolean PERFORM_DB_TESTS = true;
private final static double ZERO_DIFF = 1.0E-9;
private final static String PATH_TO_TEST_DATA = System.getProperty( "user.dir" )
+ ForesterUtil.getFileSeparator() + "test_data"
System.out.println( "failed." );
failed++;
}
- if ( PERFORM_DB_TESTS ) {
- System.out.print( "Ebi Entry Retrieval: " );
- if ( Test.testEbiEntryRetrieval() ) {
- System.out.println( "OK." );
- succeeded++;
- }
- else {
- System.out.println( "failed." );
- failed++;
- }
- }
- /////////////////////System.exit( 0 );
System.out.print( "UniProtKB id extraction: " );
if ( Test.testExtractUniProtKbProteinSeqIdentifier() ) {
System.out.println( "OK." );
failed++;
}
if ( PERFORM_DB_TESTS ) {
+ System.out.print( "Ebi Entry Retrieval: " );
+ if ( Test.testEbiEntryRetrieval() ) {
+ System.out.println( "OK." );
+ succeeded++;
+ }
+ else {
+ System.out.println( "failed." );
+ failed++;
+ }
+ }
+ // System.exit( 0 );
+ if ( PERFORM_DB_TESTS ) {
System.out.print( "Sequence DB tools 2: " );
if ( testSequenceDbWsTools2() ) {
System.out.println( "OK." );
System.exit( -1 );
}
}
+ // System.exit( 0 );
System.out.print( "Hmmscan output parser: " );
if ( testHmmscanOutputParser() ) {
System.out.println( "OK." );
System.out.println( acc.toString() );
return false;
}
+ n.setName( "gi|71845847|1,4-alpha-glucan branching enzyme [Dechloromonas aromatica RCB]" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( ( acc == null ) || !acc.getSource().equals( Source.GI.toString() )
+ || !acc.getValue().equals( "71845847" ) ) {
+ System.out.println( acc.toString() );
+ return false;
+ }
+ n.setName( "gi|71845847|gb|AAZ45343.1| 1,4-alpha-glucan branching enzyme [Dechloromonas aromatica RCB]" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( ( acc == null ) || !acc.getSource().equals( Source.NCBI.toString() )
+ || !acc.getValue().equals( "AAZ45343.1" ) ) {
+ System.out.println( acc.toString() );
+ return false;
+ }
}
catch ( final Exception e ) {
return false;
}
final PhylogenyNode n2 = new PhylogenyNode( "NM_001030253" );
SequenceDbWsTools.obtainSeqInformation( n2 );
- System.out.println( n2.toString() );
if ( !n2.getNodeData().getSequence().getName()
.equals( "Danio rerio B-cell leukemia/lymphoma 2 (bcl2), mRNA" ) ) {
return false;
}
final PhylogenyNode n3 = new PhylogenyNode( "NM_184234.2" );
SequenceDbWsTools.obtainSeqInformation( n3 );
- System.out.println( "n=" + n3.toString() );
if ( !n3.getNodeData().getSequence().getName()
.equals( "Homo sapiens RNA binding motif protein 39 (RBM39), transcript variant 1, mRNA" ) ) {
return false;
System.out.println( entry4.getGeneName() );
return false;
}
- if ( !entry4.getChromosome().equals( "ras" ) ) {
- System.out.println( entry4.getChromosome() );
- return false;
- }
- if ( !entry4.getMap().equals( "ras" ) ) {
- System.out.println( entry4.getMap() );
- return false;
- }
+ // if ( !entry4.getChromosome().equals( "ras" ) ) {
+ // System.out.println( entry4.getChromosome() );
+ // return false;
+ // }
+ // if ( !entry4.getMap().equals( "ras" ) ) {
+ // System.out.println( entry4.getMap() );
+ // return false;
+ // }
//TODO FIXME gi...
//
//TODO fails:
// if ( !entry5.getAccession().equals( "HM043801" ) ) {
// return false;
// }
+ final SequenceDatabaseEntry entry5 = SequenceDbWsTools.obtainEntry( "AAZ45343.1" );
+ if ( !entry5.getAccession().equals( "AAZ45343" ) ) {
+ return false;
+ }
+ if ( !entry5.getTaxonomyScientificName().equals( "Dechloromonas aromatica RCB" ) ) {
+ System.out.println( entry5.getTaxonomyScientificName() );
+ return false;
+ }
+ if ( !entry5.getSequenceName().equals( "Dechloromonas aromatica RCB 1,4-alpha-glucan branching enzyme" ) ) {
+ System.out.println( entry5.getSequenceName() );
+ return false;
+ }
+ if ( !entry5.getTaxonomyIdentifier().equals( "159087" ) ) {
+ System.out.println( entry5.getTaxonomyIdentifier() );
+ return false;
+ }
}
catch ( final IOException e ) {
System.out.println();
public final class EbiDbEntry implements SequenceDatabaseEntry {
- // public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
- // final EbiDbEntry e = new EbiDbEntry();
- // for( final String line : lines ) {
- // if ( line.startsWith( "PA" ) ) {
- // e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
- // }
- // else if ( line.startsWith( "DE" ) ) {
- // e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
- // }
- // else if ( line.startsWith( "OS" ) ) {
- // if ( line.indexOf( "(" ) > 0 ) {
- // e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
- // }
- // else {
- // e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
- // }
- // }
- // else if ( line.startsWith( "OX" ) ) {
- // if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
- // e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
- // }
- // }
- // }
- // return e;
- // }
- public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
- final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
- final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
- final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" );
- final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
- final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
- final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
- final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" );
- final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" );
- final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" );
- final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" );
- final Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" );
- final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
- final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" );
- final EbiDbEntry e = new EbiDbEntry();
- final StringBuilder def = new StringBuilder();
- boolean in_definition = false;
- boolean in_features = false;
- boolean in_source = false;
- boolean in_gene = false;
- boolean in_cds = false;
- boolean in_mrna = false;
- boolean in_protein = false;
- for( final String line : lines ) {
- if ( line.startsWith( "ACCESSION " ) ) {
- e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
- in_definition = false;
- }
- else if ( line.startsWith( "ID " ) ) {
- e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
- in_definition = false;
- }
- else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
- boolean definiton = false;
- if ( line.startsWith( "DEFINITION " ) ) {
- definiton = true;
- }
- if ( line.indexOf( "[" ) > 0 ) {
- if ( definiton ) {
- x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
- }
- else {
- x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
- }
- }
- else if ( line.indexOf( "." ) > 0 ) {
- if ( definiton ) {
- x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
- }
- else {
- x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
- }
- }
- else {
- if ( definiton ) {
- x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
- }
- else {
- x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
- }
- }
- if ( definiton ) {
- in_definition = true;
- }
- }
- else if ( line.startsWith( " ORGANISM " ) ) {
- if ( line.indexOf( "(" ) > 0 ) {
- e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) );
- }
- else {
- e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) );
- }
- // in_def = false;
- }
- else if ( line.startsWith( "OS " ) ) {
- if ( line.indexOf( "(" ) > 0 ) {
- e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
- }
- else {
- e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
- }
- }
- else if ( line.startsWith( " " ) && in_definition ) {
- def.append( " " );
- if ( line.indexOf( "[" ) > 0 ) {
- def.append( SequenceDbWsTools.extractTo( line, "[" ) );
- }
- else if ( line.indexOf( "." ) > 0 ) {
- def.append( SequenceDbWsTools.extractTo( line, "." ) );
- }
- else {
- def.append( line.trim() );
- }
- }
- else {
- in_definition = false;
- }
- if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) {
- in_features = false;
- in_source = false;
- in_gene = false;
- in_cds = false;
- in_mrna = false;
- in_protein = false;
- // in_def = false;
- }
- if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
- in_features = true;
- }
- if ( in_features && ( line.startsWith( " source " ) || line.startsWith( "FT source " ) ) ) {
- in_source = true;
- in_gene = false;
- in_cds = false;
- in_mrna = false;
- in_protein = false;
- }
- if ( in_features && ( line.startsWith( " gene " ) || line.startsWith( "FT gene " ) ) ) {
- in_source = false;
- in_gene = true;
- in_cds = false;
- in_mrna = false;
- in_protein = false;
- }
- if ( in_features && ( line.startsWith( " CDS " ) || line.startsWith( "FT CDS " ) ) ) {
- in_source = false;
- in_gene = false;
- in_cds = true;
- in_mrna = false;
- in_protein = false;
- }
- if ( in_features && ( line.startsWith( " Protein " ) || line.startsWith( "FT Protein " ) ) ) {
- in_source = false;
- in_gene = false;
- in_cds = false;
- in_mrna = false;
- in_protein = true;
- }
- if ( in_features && ( line.startsWith( " mRNA " ) || line.startsWith( "FT mRNA " ) ) ) {
- in_source = false;
- in_gene = false;
- in_cds = false;
- in_mrna = true;
- in_protein = false;
- }
- if ( in_source ) {
- final Matcher ti = taxon_PATTERN.matcher( line );
- if ( ti.find() ) {
- e.setTaxId( ti.group( 1 ) );
- }
- final Matcher chr = chromosome_PATTERN.matcher( line );
- if ( chr.find() ) {
- e.setChromosome( chr.group( 1 ) );
- }
- final Matcher map = map_PATTERN.matcher( line );
- if ( map.find() ) {
- e.setMap( map.group( 1 ) );
- }
- }
- if ( in_cds || in_gene ) {
- final Matcher hgnc = hgnc_PATTERN.matcher( line );
- if ( hgnc.find() ) {
- e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) );
- }
- final Matcher geneid = geneid_PATTERN.matcher( line );
- if ( geneid.find() ) {
- e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) );
- }
- }
- if ( in_protein || in_cds || in_gene || in_mrna ) {
- final Matcher ec = ec_PATTERN.matcher( line );
- if ( ec.find() ) {
- e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) );
- }
- final Matcher gene = gene_PATTERN.matcher( line );
- if ( gene.find() ) {
- e.setGeneName( gene.group( 1 ) );
- }
- final Matcher uniprot = uniprot_PATTERN.matcher( line );
- if ( uniprot.find() ) {
- e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) );
- }
- final Matcher interpro = interpro_PATTERN.matcher( line );
- if ( interpro.find() ) {
- e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) );
- }
- final Matcher mim = mim_PATTERN.matcher( line );
- if ( mim.find() ) {
- e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) );
- }
- final Matcher product = product_PATTERN.matcher( line );
- if ( product.find() ) {
- e.setSequenceSymbol( product.group( 1 ) );
- }
- final Matcher pdb = pdb_PATTERN.matcher( line );
- if ( pdb.find() ) {
- e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) );
- }
- }
- }
- if ( def.length() > 0 ) {
- e.setSequenceName( def.toString().trim() );
- }
- return e;
- }
- private String _map;
- private String _chromosome;
-
- private void setMap( final String map ) {
- _map = map;
- }
-
- private void setChromosome( final String chromosome ) {
- _chromosome = chromosome;
- }
-
- @Override
- public String getMap() {
- return _map;
- }
-
- @Override
- public String getChromosome() {
- return _chromosome;
- }
-
- private static void x( final StringBuilder sb, final String s ) {
- if ( sb.length() > 0 ) {
- sb.append( " " );
- }
- sb.append( s.trim() );
- }
+ private SortedSet<Annotation> _annotations;
+ private String _chromosome;
+ private SortedSet<Accession> _cross_references;
+ private String _de;
+ private String _gene_name;
+ private String _map;
+ private String _os;
// FIXME actually this is NCBI entry
//http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
private String _pa;
- private String _de;
- private String _os;
- private String _tax_id;
- private String _symbol;
private String _provider;
- private SortedSet<Accession> _cross_references;
- private SortedSet<Annotation> _annotations;
- private String _gene_name;
+ private String _symbol;
+ private String _tax_id;
// TODO PUBMED 15798186
//TODO (FEATURES)
private EbiDbEntry() {
}
- private void addCrossReference( final Accession accession ) {
- if ( _cross_references == null ) {
- _cross_references = new TreeSet<Accession>();
- }
- System.out.println( "XREF ADDED: " + accession );
- _cross_references.add( accession );
- }
-
@Override
public Object clone() throws CloneNotSupportedException {
throw new CloneNotSupportedException();
}
@Override
+ public SortedSet<Annotation> getAnnotations() {
+ return _annotations;
+ }
+
+ @Override
+ public String getChromosome() {
+ return _chromosome;
+ }
+
+ @Override
public SortedSet<Accession> getCrossReferences() {
return _cross_references;
}
}
@Override
+ public String getMap() {
+ return _map;
+ }
+
+ @Override
public String getProvider() {
return _provider;
}
return _symbol;
}
- private void setSequenceSymbol( final String symbol ) {
- _symbol = symbol;
- }
-
@Override
public String getTaxonomyIdentifier() {
return _tax_id;
&& ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) );
}
- private void setSequenceName( final String rec_name ) {
- if ( _de == null ) {
- _de = rec_name;
+ public void setProvider( final String provider ) {
+ _provider = provider;
+ }
+
+ private void addAnnotation( final Annotation annotation ) {
+ if ( _annotations == null ) {
+ _annotations = new TreeSet<Annotation>();
}
+ _annotations.add( annotation );
+ }
+
+ private void addCrossReference( final Accession accession ) {
+ if ( _cross_references == null ) {
+ _cross_references = new TreeSet<Accession>();
+ }
+ System.out.println( "XREF ADDED: " + accession );
+ _cross_references.add( accession );
+ }
+
+ private void setAccession( final String pa ) {
+ if ( _pa == null ) {
+ _pa = pa;
+ }
+ }
+
+ private void setChromosome( final String chromosome ) {
+ _chromosome = chromosome;
}
private void setGeneName( final String gene_name ) {
}
}
- private void setTaxonomyScientificName( final String os ) {
- if ( _os == null ) {
- _os = os;
- }
+ private void setMap( final String map ) {
+ _map = map;
}
- private void setAccession( final String pa ) {
- if ( _pa == null ) {
- _pa = pa;
+ private void setSequenceName( final String rec_name ) {
+ if ( _de == null ) {
+ _de = rec_name;
}
}
- public void setProvider( final String provider ) {
- _provider = provider;
+ private void setSequenceSymbol( final String symbol ) {
+ _symbol = symbol;
}
private void setTaxId( final String tax_id ) {
}
}
- @Override
- public SortedSet<Annotation> getAnnotations() {
- return _annotations;
+ private void setTaxonomyScientificName( final String os ) {
+ if ( _os == null ) {
+ _os = os;
+ }
}
- private void addAnnotation( final Annotation annotation ) {
- if ( _annotations == null ) {
- _annotations = new TreeSet<Annotation>();
+ // public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
+ // final EbiDbEntry e = new EbiDbEntry();
+ // for( final String line : lines ) {
+ // if ( line.startsWith( "PA" ) ) {
+ // e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
+ // }
+ // else if ( line.startsWith( "DE" ) ) {
+ // e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
+ // }
+ // else if ( line.startsWith( "OS" ) ) {
+ // if ( line.indexOf( "(" ) > 0 ) {
+ // e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
+ // }
+ // else {
+ // e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
+ // }
+ // }
+ // else if ( line.startsWith( "OX" ) ) {
+ // if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
+ // e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
+ // }
+ // }
+ // }
+ // return e;
+ // }
+ public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
+ final Pattern X_PATTERN = Pattern.compile( "^[A-Z]+" );
+ final Pattern chromosome_PATTERN = Pattern.compile( "\\s+/chromosome=\"(\\w+)\"" );
+ final Pattern map_PATTERN = Pattern.compile( "\\s+/map=\"([\\w+\\.])\"" );
+ final Pattern gene_PATTERN = Pattern.compile( "\\s+/gene=\"(.+)\"" );
+ final Pattern mim_PATTERN = Pattern.compile( "\\s+/db_xref=\"MIM:(\\d+)\"" );
+ final Pattern taxon_PATTERN = Pattern.compile( "\\s+/db_xref=\"taxon:(\\d+)\"" );
+ final Pattern interpro_PATTERN = Pattern.compile( "\\s+/db_xref=\"InterPro:([A-Z0-9]+)\"" );
+ final Pattern uniprot_PATTERN = Pattern.compile( "\\s+/db_xref=\"UniProtKB/[A-Za-z-]*:(\\w+)\"" );
+ final Pattern hgnc_PATTERN = Pattern.compile( "\\s+/db_xref=\"[A-Z:]*HGNC:(\\d+)\"" );
+ final Pattern geneid_PATTERN = Pattern.compile( "\\s+/db_xref=\"GeneID:(\\d+)\"" );
+ final Pattern pdb_PATTERN = Pattern.compile( "\\s+/db_xref=\"PDB:([A-Z0-9]+)\"" );
+ final Pattern ec_PATTERN = Pattern.compile( "\\s+/EC_number=\"([\\.\\-\\d]+)\"" );
+ final Pattern product_PATTERN = Pattern.compile( "\\s+/product=\"(\\w{1,10})\"" );
+ final EbiDbEntry e = new EbiDbEntry();
+ final StringBuilder def = new StringBuilder();
+ boolean in_definition = false;
+ boolean in_features = false;
+ boolean in_source = false;
+ boolean in_gene = false;
+ boolean in_cds = false;
+ boolean in_mrna = false;
+ boolean in_protein = false;
+ for( final String line : lines ) {
+ if ( line.startsWith( "ACCESSION " ) ) {
+ e.setAccession( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
+ in_definition = false;
+ }
+ else if ( line.startsWith( "ID " ) ) {
+ e.setAccession( SequenceDbWsTools.extractFromTo( line, "ID", ";" ) );
+ in_definition = false;
+ }
+ else if ( line.startsWith( "DEFINITION " ) || ( line.startsWith( "DE " ) ) ) {
+ boolean definiton = false;
+ if ( line.startsWith( "DEFINITION " ) ) {
+ definiton = true;
+ }
+ if ( line.indexOf( "[" ) > 0 ) {
+ if ( definiton ) {
+ x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) ) );
+ }
+ else {
+ x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "[" ) ) );
+ }
+ }
+ else if ( line.indexOf( "." ) > 0 ) {
+ if ( definiton ) {
+ x( def, ( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) ) );
+ }
+ else {
+ x( def, ( SequenceDbWsTools.extractFromTo( line, "DE", "." ) ) );
+ }
+ }
+ else {
+ if ( definiton ) {
+ x( def, ( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) ) );
+ }
+ else {
+ x( def, ( SequenceDbWsTools.extractFrom( line, "DE" ) ) );
+ }
+ }
+ if ( definiton ) {
+ in_definition = true;
+ }
+ }
+ else if ( line.startsWith( " ORGANISM " ) ) {
+ if ( line.indexOf( "(" ) > 0 ) {
+ e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, " ORGANISM", "(" ) );
+ }
+ else {
+ e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, " ORGANISM" ) );
+ }
+ // in_def = false;
+ }
+ else if ( line.startsWith( "OS " ) ) {
+ if ( line.indexOf( "(" ) > 0 ) {
+ e.setTaxonomyScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
+ }
+ else {
+ e.setTaxonomyScientificName( SequenceDbWsTools.extractFrom( line, "OS" ) );
+ }
+ }
+ else if ( line.startsWith( " " ) && in_definition ) {
+ def.append( " " );
+ if ( line.indexOf( "[" ) > 0 ) {
+ def.append( SequenceDbWsTools.extractTo( line, "[" ) );
+ }
+ else if ( line.indexOf( "." ) > 0 ) {
+ def.append( SequenceDbWsTools.extractTo( line, "." ) );
+ }
+ else {
+ def.append( line.trim() );
+ }
+ }
+ else {
+ in_definition = false;
+ }
+ if ( !line.startsWith( "FT " ) && X_PATTERN.matcher( line ).find() ) {
+ in_features = false;
+ in_source = false;
+ in_gene = false;
+ in_cds = false;
+ in_mrna = false;
+ in_protein = false;
+ // in_def = false;
+ }
+ if ( line.startsWith( "FEATURES " ) || line.startsWith( "FT " ) ) {
+ in_features = true;
+ }
+ if ( in_features && ( line.startsWith( " source " ) || line.startsWith( "FT source " ) ) ) {
+ in_source = true;
+ in_gene = false;
+ in_cds = false;
+ in_mrna = false;
+ in_protein = false;
+ }
+ if ( in_features && ( line.startsWith( " gene " ) || line.startsWith( "FT gene " ) ) ) {
+ in_source = false;
+ in_gene = true;
+ in_cds = false;
+ in_mrna = false;
+ in_protein = false;
+ }
+ if ( in_features && ( line.startsWith( " CDS " ) || line.startsWith( "FT CDS " ) ) ) {
+ in_source = false;
+ in_gene = false;
+ in_cds = true;
+ in_mrna = false;
+ in_protein = false;
+ }
+ if ( in_features && ( line.startsWith( " Protein " ) || line.startsWith( "FT Protein " ) ) ) {
+ in_source = false;
+ in_gene = false;
+ in_cds = false;
+ in_mrna = false;
+ in_protein = true;
+ }
+ if ( in_features && ( line.startsWith( " mRNA " ) || line.startsWith( "FT mRNA " ) ) ) {
+ in_source = false;
+ in_gene = false;
+ in_cds = false;
+ in_mrna = true;
+ in_protein = false;
+ }
+ if ( in_source ) {
+ final Matcher ti = taxon_PATTERN.matcher( line );
+ if ( ti.find() ) {
+ e.setTaxId( ti.group( 1 ) );
+ }
+ final Matcher chr = chromosome_PATTERN.matcher( line );
+ if ( chr.find() ) {
+ e.setChromosome( chr.group( 1 ) );
+ }
+ final Matcher map = map_PATTERN.matcher( line );
+ if ( map.find() ) {
+ e.setMap( map.group( 1 ) );
+ }
+ }
+ if ( in_cds || in_gene ) {
+ final Matcher hgnc = hgnc_PATTERN.matcher( line );
+ if ( hgnc.find() ) {
+ e.addCrossReference( new Accession( hgnc.group( 1 ), "hgnc" ) );
+ }
+ final Matcher geneid = geneid_PATTERN.matcher( line );
+ if ( geneid.find() ) {
+ e.addCrossReference( new Accession( geneid.group( 1 ), "geneid" ) );
+ }
+ }
+ if ( in_protein || in_cds || in_gene || in_mrna ) {
+ final Matcher ec = ec_PATTERN.matcher( line );
+ if ( ec.find() ) {
+ e.addAnnotation( new Annotation( "EC", ec.group( 1 ) ) );
+ }
+ final Matcher gene = gene_PATTERN.matcher( line );
+ if ( gene.find() ) {
+ e.setGeneName( gene.group( 1 ) );
+ }
+ final Matcher uniprot = uniprot_PATTERN.matcher( line );
+ if ( uniprot.find() ) {
+ e.addCrossReference( new Accession( uniprot.group( 1 ), "uniprot" ) );
+ }
+ final Matcher interpro = interpro_PATTERN.matcher( line );
+ if ( interpro.find() ) {
+ e.addCrossReference( new Accession( interpro.group( 1 ), "interpro" ) );
+ }
+ final Matcher mim = mim_PATTERN.matcher( line );
+ if ( mim.find() ) {
+ e.addCrossReference( new Accession( mim.group( 1 ), "mim" ) );
+ }
+ final Matcher product = product_PATTERN.matcher( line );
+ if ( product.find() ) {
+ e.setSequenceSymbol( product.group( 1 ) );
+ }
+ final Matcher pdb = pdb_PATTERN.matcher( line );
+ if ( pdb.find() ) {
+ e.addCrossReference( new Accession( pdb.group( 1 ), "pdb" ) );
+ }
+ }
}
- _annotations.add( annotation );
+ if ( def.length() > 0 ) {
+ e.setSequenceName( def.toString().trim() );
+ }
+ return e;
+ }
+
+ private static void x( final StringBuilder sb, final String s ) {
+ if ( sb.length() > 0 ) {
+ sb.append( " " );
+ }
+ sb.append( s.trim() );
}
}