}
private static boolean testUniprotEntryRetrieval() {
- if ( !SequenceDbWsTools.parseUniProtAccessor( "P12345" ).equals( "P12345" ) ) {
- return false;
- }
- if ( SequenceDbWsTools.parseUniProtAccessor( "EP12345" ) != null ) {
- return false;
- }
- if ( SequenceDbWsTools.parseUniProtAccessor( "3 4P12345" ) != null ) {
- return false;
- }
- if ( SequenceDbWsTools.parseUniProtAccessor( "P12345E" ) != null ) {
- return false;
- }
- if ( SequenceDbWsTools.parseUniProtAccessor( "P123455" ) != null ) {
- return false;
- }
- if ( SequenceDbWsTools.parseUniProtAccessor( "EP12345E" ) != null ) {
- return false;
- }
- if ( SequenceDbWsTools.parseUniProtAccessor( "AY423861" ) != null ) {
- return false;
- }
- if ( !SequenceDbWsTools.parseUniProtAccessor( "P1DDD5" ).equals( "P1DDD5" ) ) {
- return false;
- }
- if ( SequenceDbWsTools.parseUniProtAccessor( "P1DDDD" ) != null ) {
- return false;
- }
- if ( !SequenceDbWsTools.parseUniProtAccessor( "P1234X/P12345/12-42" ).equals( "P12345" ) ) {
- return false;
- }
- if ( !SequenceDbWsTools.parseUniProtAccessor( "P1234X P12345 12-42" ).equals( "P12345" ) ) {
- return false;
- }
- if ( !SequenceDbWsTools.parseUniProtAccessor( "P12345/12-42" ).equals( "P12345" ) ) {
- return false;
- }
- if ( !SequenceDbWsTools.parseUniProtAccessor( "P1234X/P12345" ).equals( "P12345" ) ) {
- return false;
- }
try {
final SequenceDatabaseEntry entry = SequenceDbWsTools.obtainUniProtEntry( "P12345", 200 );
if ( !entry.getAccession().equals( "P12345" ) ) {
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;
public final class SequenceDbWsTools {
- private static final boolean ALLOW_TAXONOMY_CODE_HACKS = true; //TODO turn off for final realease!
+ private static final boolean ALLOW_TAXONOMY_CODE_HACKS = true; //TODO turn off for final release!
public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/";
public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/";
public final static String EMBL_DBS_EMBL = "embl";
public final static String EMBL_DBS_REFSEQ_P = "refseqp";
public final static String EMBL_DBS_REFSEQ_N = "refseqn";
private final static String URL_ENC = "UTF-8";
- // uniprot/expasy accession number format (6 chars):
- // letter digit letter-or-digit letter-or-digit letter-or-digit digit
- // ?: => no back-reference
- // \A => begin of String
- // \Z => end of String
- private final static Pattern UNIPROT_AC_PATTERN = Pattern
- .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d[A-Z0-9]{3}\\d)(?:[^a-zA-Z0-9]|\\Z)" );
private final static boolean DEBUG = false;
public static List<UniProtTaxonomy> getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return )
db = Db.EMBL;
}
else if ( !ForesterUtil.isEmpty( node.getName() ) ) {
- if ( ( query = parseUniProtAccessor( node.getName() ) ) != null ) {
+ if ( ( query = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node ) ) != null ) {
db = Db.UNIPROT;
}
else if ( ( id = SequenceIdParser.parse( node.getName() ) ) != null ) {
}
else if ( ( db == Db.NCBI ) && ( id != null ) ) {
db_entry = obtainEmblEntry( id, lines_to_return );
- if ( ( db_entry != null ) && !db_entry.isEmpty() ) {
- final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence()
- : new Sequence();
- if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) {
- String type = null;
- if ( db == Db.EMBL ) {
- type = "embl";
- }
- else if ( db == Db.UNIPROT ) {
- type = "uniprot";
- }
- else if ( db == Db.NCBI ) {
- type = "ncbi";
- }
- else if ( db == Db.REFSEQ ) {
- type = "refseq";
- }
- seq.setAccession( new Accession( db_entry.getAccession(), type ) );
- }
- if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) {
- seq.setName( db_entry.getSequenceName() );
+ }
+ if ( ( db_entry != null ) && !db_entry.isEmpty() ) {
+ final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence()
+ : new Sequence();
+ if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) {
+ String type = null;
+ if ( db == Db.EMBL ) {
+ type = "embl";
}
- if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) {
- seq.setSymbol( db_entry.getSequenceSymbol() );
+ else if ( db == Db.UNIPROT ) {
+ type = "uniprot";
}
- final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy()
- : new Taxonomy();
- if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) {
- tax.setScientificName( db_entry.getTaxonomyScientificName() );
+ else if ( db == Db.NCBI ) {
+ type = "ncbi";
}
- if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) {
- tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) );
+ else if ( db == Db.REFSEQ ) {
+ type = "refseq";
}
- node.getNodeData().setTaxonomy( tax );
- node.getNodeData().setSequence( seq );
+ seq.setAccession( new Accession( db_entry.getAccession(), type ) );
}
- else if ( db != Db.NONE ) {
- not_found.add( node.getName() );
+ if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) {
+ seq.setName( db_entry.getSequenceName() );
}
- try {
- Thread.sleep( 10 );// Sleep for 10 ms
+ if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) {
+ seq.setSymbol( db_entry.getSequenceSymbol() );
}
- catch ( final InterruptedException ie ) {
+ final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy()
+ : new Taxonomy();
+ if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) {
+ tax.setScientificName( db_entry.getTaxonomyScientificName() );
}
+ if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) {
+ tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) );
+ }
+ node.getNodeData().setTaxonomy( tax );
+ node.getNodeData().setSequence( seq );
+ }
+ else if ( db != Db.NONE ) {
+ not_found.add( node.getName() );
+ }
+ try {
+ Thread.sleep( 10 );// Sleep for 10 ms
+ }
+ catch ( final InterruptedException ie ) {
}
}
return not_found;
return UniProtEntry.createInstanceFromPlainText( lines );
}
- /**
- * Returns null if no match.
- *
- * @param query
- * @param db
- * @return
- */
- static public String parseUniProtAccessor( final String query ) {
- final Matcher m = UNIPROT_AC_PATTERN.matcher( query );
- if ( m.lookingAt() ) {
- return m.group( 1 );
- }
- else {
- return null;
- }
- }
-
public static List<String> queryDb( final String query, int max_lines_to_return, final String base_url )
throws IOException {
if ( ForesterUtil.isEmpty( query ) ) {
public final class UniProtEntry implements SequenceDatabaseEntry {
private String _ac;
- private String _rec_name;
+ private String _name;
private String _os_scientific_name;
private String _tax_id;
- private String _symbol;
private UniProtEntry() {
}
if ( line.startsWith( "AC" ) ) {
e.setAc( DatabaseTools.extract( line, "AC", ";" ) );
}
- else if ( line.startsWith( "DE" ) ) {
+ else if ( line.startsWith( "DE" ) && ForesterUtil.isEmpty( e.getSequenceName() ) ) {
if ( ( line.indexOf( "RecName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) {
- e.setRecName( DatabaseTools.extract( line, "Full=", ";" ) );
+ e.setSequenceName( DatabaseTools.extract( line, "Full=", ";" ) );
}
- }
- else if ( line.startsWith( "GN" ) ) {
- if ( ( line.indexOf( "Name=" ) > 0 ) ) {
- e.setSymbol( DatabaseTools.extract( line, "Name=", ";" ) );
+ else if ( ( line.indexOf( "SubName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) {
+ e.setSequenceName( DatabaseTools.extract( line, "Full=", ";" ) );
}
}
else if ( line.startsWith( "OS" ) ) {
@Override
public String getSequenceName() {
- return _rec_name;
+ return _name; // backing field renamed from _rec_name; may still be null if no name was ever assigned
}
- private void setRecName( final String rec_name ) {
- if ( _rec_name == null ) {
- _rec_name = rec_name;
+ private void setSequenceName( final String name ) { // renamed from setRecName; stores the entry's sequence name
+ if ( _name == null ) { // first assignment wins: once a name is stored, later calls leave it unchanged
+ _name = name;
}
}
@Override
public String getSequenceSymbol() {
- return _symbol;
- }
-
- private void setSymbol( final String symbol ) {
- if ( _symbol == null ) {
- _symbol = symbol;
- }
+ return ""; // always the empty string: this change removes the _symbol field and its setSymbol setter
}
@Override