import org.forester.archaeopteryx.MainFrameApplication;
import org.forester.archaeopteryx.TreePanel;
+import org.forester.archaeopteryx.tools.AncestralTaxonomyInferrer;
import org.forester.archaeopteryx.tools.RunnableProcess;
import org.forester.io.parsers.phyloxml.PhyloXmlDataFormatException;
import org.forester.phylogeny.Phylogeny;
}
private final String getBaseUrl() {
- return UniProtWsTools.BASE_URL;
+ return AncestralTaxonomyInferrer.getBaseUrl(); // delegates to AncestralTaxonomyInferrer's static accessor instead of reading the UniProtWsTools constant directly
}
@Override
_obtain_detailed_taxonomic_information_deleting_jmi
.setToolTipText( "To add additional taxonomic information, deletes nodes for which taxonomy cannot found (from UniProt Taxonomy)" );
_tools_menu
- .add( _obtain_uniprot_seq_information_jmi = new JMenuItem( "Obtain Sequence Information (from UniProt)" ) );
+ .add( _obtain_uniprot_seq_information_jmi = new JMenuItem( "Obtain Sequence Information" ) );
customizeJMenuItem( _obtain_uniprot_seq_information_jmi );
- _obtain_uniprot_seq_information_jmi.setToolTipText( "To add additional sequence information (from UniProt)" );
+ _obtain_uniprot_seq_information_jmi.setToolTipText( "To add additional sequence information" );
_tools_menu.addSeparator();
if ( !Constants.__RELEASE ) {
_tools_menu.add( _function_analysis = new JMenuItem( "Add UniProtKB Annotations" ) );
_treepanel = treepanel;
}
- private String getBaseUrl() {
- return UniProtWsTools.BASE_URL;
+ public static String getBaseUrl() { // widened to public static; presumably so other tools can obtain the URL without an instance - confirm against callers
+ return UniProtWsTools.BASE_UNIPROT_URL; // UniProt base URL constant declared in UniProtWsTools
}
private void inferTaxonomies() {
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.util.ForesterUtil;
import org.forester.util.SequenceIdParser;
-import org.forester.ws.uniprot.DatabaseTools;
import org.forester.ws.uniprot.SequenceDatabaseEntry;
import org.forester.ws.uniprot.UniProtWsTools;
private final Phylogeny _phy;
private final MainFrameApplication _mf;
private final TreePanel _treepanel;
- private final static boolean DEBUG = false;
+ private final static boolean DEBUG = true;
private enum Db {
UNKNOWN, UNIPROT, EMBL, NCBI;
_treepanel = treepanel;
}
- private String getBaseUrl() {
- return UniProtWsTools.BASE_URL;
- }
-
private void execute() {
start( _mf, "sequence data" );
SortedSet<String> not_found = null;
not_found = obtainSeqInformation( _phy );
}
catch ( final UnknownHostException e ) {
+ final String what = "_"; //TODO FIXME
JOptionPane.showMessageDialog( _mf,
- "Could not connect to \"" + getBaseUrl() + "\"",
+ "Could not connect to \"" + what + "\"",
"Network error during taxonomic information gathering",
JOptionPane.ERROR_MESSAGE );
return;
tax = new Taxonomy();
}
String query = null;
+ Identifier id = null;
Db db = Db.UNKNOWN;
if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null )
&& !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
if ( ( query = UniProtWsTools.parseUniProtAccessor( node.getName() ) ) != null ) {
db = Db.UNIPROT;
}
- else if ( ( query = SequenceIdParser.parseGenbankAccessor( node.getName() ) ) != null ) {
+ else if ( ( id = SequenceIdParser.parse( node.getName() ) ) != null ) {
db = Db.NCBI;
}
}
+ SequenceDatabaseEntry db_entry = null;
if ( !ForesterUtil.isEmpty( query ) ) {
- SequenceDatabaseEntry db_entry = null;
if ( db == Db.UNIPROT ) {
if ( DEBUG ) {
System.out.println( "uniprot: " + query );
db = Db.EMBL;
}
}
- if ( ( db_entry != null ) && !db_entry.isEmpty() ) {
- if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) {
- String type = null;
- if ( db == Db.EMBL ) {
- type = "embl";
- }
- else if ( db == Db.UNIPROT ) {
- type = "uniprot";
- }
- seq.setAccession( new Accession( db_entry.getAccession(), type ) );
- }
- if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) {
- seq.setName( db_entry.getSequenceName() );
- }
- if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) {
- seq.setSymbol( db_entry.getSequenceSymbol() );
- }
- if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) {
- tax.setScientificName( db_entry.getTaxonomyScientificName() );
+ }
+ else if ( ( db == Db.NCBI ) && ( id != null ) ) {
+ System.out.println( "db == Db.NCBI && id != null" );
+ db_entry = UniProtWsTools.obtainrefSeqentryFromEmbl( id, 200 );
+ }
+ if ( ( db_entry != null ) && !db_entry.isEmpty() ) {
+ if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) {
+ String type = null;
+ if ( db == Db.EMBL ) {
+ type = "embl";
}
- if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) {
- tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) );
+ else if ( db == Db.UNIPROT ) {
+ type = "uniprot";
}
- node.getNodeData().setTaxonomy( tax );
- node.getNodeData().setSequence( seq );
+ seq.setAccession( new Accession( db_entry.getAccession(), type ) );
+ }
+ if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) {
+ seq.setName( db_entry.getSequenceName() );
+ }
+ if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) {
+ seq.setSymbol( db_entry.getSequenceSymbol() );
}
- else {
- not_found.add( node.getName() );
+ if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) {
+ tax.setScientificName( db_entry.getTaxonomyScientificName() );
}
+ if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) {
+ tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) );
+ }
+ node.getNodeData().setTaxonomy( tax );
+ node.getNodeData().setSequence( seq );
+ }
+ else {
+ not_found.add( node.getName() );
}
}
return not_found;
public final class Identifier implements PhylogenyData {
+ final public static String NCBI = "ncbi";
+ final public static String REFSEQ = "refseq";
+
final private String _value;
final private String _provider;
final private String _value_provider;
+
+
public Identifier() {
_value = "";
|| ForesterUtil.isEmpty( id.getValue() )
|| ForesterUtil.isEmpty( id.getProvider() )
|| !id.getValue().equals( "XP_002434188" )
- || !id.getProvider().equals( "ncbi" ) ) {
+ || !id.getProvider().equals( "refseq" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
System.out.println( "provider=" + id.getProvider() );
|| ForesterUtil.isEmpty( id.getValue() )
|| ForesterUtil.isEmpty( id.getProvider() )
|| !id.getValue().equals( "XP_002434188" )
- || !id.getProvider().equals( "ncbi" ) ) {
+ || !id.getProvider().equals( "refseq" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
System.out.println( "provider=" + id.getProvider() );
\r
\r
private final static boolean DEBUG = true;\r
+ \r
+ \r
\r
\r
/**\r
public final static Identifier parse( final String s ) {\r
String v = parseGenbankAccessor( s );\r
if ( !ForesterUtil.isEmpty( v ) ) {\r
- return new Identifier( v, "ncbi" );\r
+ return new Identifier( v, Identifier.NCBI ); // GenBank accessor matched: tag the identifier with the NCBI provider constant\r
}\r
v = parseRefSeqAccessor( s );\r
if ( !ForesterUtil.isEmpty( v ) ) {\r
- return new Identifier( v, "ncbi" );\r
+ return new Identifier( v, Identifier.REFSEQ ); // RefSeq accessor matched: use the dedicated RefSeq provider constant rather than the NCBI one\r
}\r
return null;\r
}\r
* Returns null if no match.\r
* \r
*/\r
- static public String parseGenbankAccessor( final String query ) {\r
+ public static String parseGenbankAccessor( final String query ) {\r
Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query );\r
if ( m.lookingAt() ) {\r
return m.group( 1 );\r
* Returns null if no match.\r
* \r
*/\r
- public final static String parseRefSeqAccessor( final String query ) {\r
+ private final static String parseRefSeqAccessor( final String query ) {\r
Matcher m = REFSEQ_PATTERN.matcher( query );\r
if ( m.lookingAt() ) {\r
return m.group( 1 );\r
throw new CloneNotSupportedException();
}
+
+    /**
+     * Builds a sequence database entry from the lines of a RefSeq record.
+     * <p>
+     * Only three kinds of lines are consulted; every other line is ignored:
+     * <ul>
+     * <li>lines starting with {@code ACCESSION} are extracted and passed to {@code setPA}</li>
+     * <li>lines starting with {@code DEFINITION} are extracted and passed to {@code setDe};
+     * if the line contains a {@code [} past its first character, the extraction is
+     * bounded by that {@code [}</li>
+     * <li>lines starting with {@code SOURCE} are extracted and passed to {@code setOs};
+     * if the line contains a {@code (} past its first character, the extraction is
+     * bounded by that {@code (}</li>
+     * </ul>
+     * If several lines start with the same tag, the last one wins because each match
+     * calls the setter again.
+     *
+     * @param lines the lines of the record, in order
+     * @return the populated entry; a field is left as constructed when no matching line is present
+     */
+    public static SequenceDatabaseEntry createInstanceForRefSeq( final List<String> lines ) {
+        final EbiDbEntry e = new EbiDbEntry();
+        for( final String line : lines ) {
+            if ( line.startsWith( "ACCESSION" ) ) {
+                e.setPA( DatabaseTools.extract( line, "ACCESSION" ) );
+            }
+            else if ( line.startsWith( "DEFINITION" ) ) {
+                if ( line.indexOf( "[" ) > 0 ) {
+                    // Pass the complete tag, exactly as the branch below does; a truncated
+                    // tag would presumably leave a stray leading character in the
+                    // description (depends on DatabaseTools.extract - confirm).
+                    e.setDe( DatabaseTools.extract( line, "DEFINITION", "[" ) );
+                }
+                else {
+                    e.setDe( DatabaseTools.extract( line, "DEFINITION" ) );
+                }
+            }
+            else if ( line.startsWith( "SOURCE" ) ) {
+                if ( line.indexOf( "(" ) > 0 ) {
+                    e.setOs( DatabaseTools.extract( line, "SOURCE", "(" ) );
+                }
+                else {
+                    e.setOs( DatabaseTools.extract( line, "SOURCE" ) );
+                }
+            }
+        }
+        return e;
+    }
+
+
+
public static SequenceDatabaseEntry createInstanceFromPlainText( final List<String> lines ) {
final EbiDbEntry e = new EbiDbEntry();
for( final String line : lines ) {
+ System.out.println( "->" + line );
if ( line.startsWith( "PA" ) ) {
e.setPA( DatabaseTools.extract( line, "PA" ) );
}
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.forester.phylogeny.data.Identifier;
import org.forester.util.ForesterUtil;
public final class UniProtWsTools {
public enum Db {
UNKNOWN, UNIPROT;
}
- public final static String BASE_URL = "http://www.uniprot.org/";
- public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/embl/";
+ public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/";
+ public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/";
+ public final static String EMBL_DBS_EMBL = "embl";
+ public final static String EMBL_DBS_REFSEQ_P = "refseqp";
+ public final static String EMBL_DBS_REFSEQ_N = "refseqn";
+
private final static String URL_ENC = "UTF-8";
// uniprot/expasy accession number format (6 chars):
// letter digit letter-or-digit letter-or-digit letter-or-digit digit
return taxonomies;
}
- public static List<String> queryEmblDb( final String query, final int max_lines_to_return ) throws IOException {
- return queryDb( query, max_lines_to_return, BASE_EMBL_DB_URL );
+    /**
+     * Queries the dbfetch service for the record named by {@code id}.
+     * <p>
+     * The database segment of the URL is chosen from the identifier's provider:
+     * <ul>
+     * <li>empty provider or {@link Identifier#NCBI}: {@link #EMBL_DBS_EMBL}</li>
+     * <li>{@link Identifier#REFSEQ}: {@link #EMBL_DBS_REFSEQ_P} when the second character
+     * of the value is the first {@code P} in it (case-insensitive), otherwise
+     * {@link #EMBL_DBS_REFSEQ_N}</li>
+     * </ul>
+     * NOTE(review): for any other provider no database segment is appended and the
+     * value is sent against the bare base URL - confirm whether that is intended.
+     *
+     * @param id the identifier whose value is looked up and whose provider selects the database
+     * @param max_lines_to_return forwarded unchanged to {@code queryDb}
+     * @return whatever {@code queryDb} returns for the assembled URL
+     * @throws IOException propagated from {@code queryDb}
+     */
+    public static List<String> queryEmblDb( final Identifier id, final int max_lines_to_return ) throws IOException {
+        final StringBuilder url_sb = new StringBuilder( BASE_EMBL_DB_URL );
+        final String provider = id.getProvider();
+        String db = null;
+        if ( ForesterUtil.isEmpty( provider ) || provider.equalsIgnoreCase( Identifier.NCBI ) ) {
+            db = UniProtWsTools.EMBL_DBS_EMBL;
+        }
+        else if ( provider.equalsIgnoreCase( Identifier.REFSEQ ) ) {
+            if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) {
+                db = UniProtWsTools.EMBL_DBS_REFSEQ_P;
+            }
+            else {
+                db = UniProtWsTools.EMBL_DBS_REFSEQ_N;
+            }
+        }
+        if ( db != null ) {
+            // BASE_EMBL_DB_URL already ends with '/', so only the trailing separator
+            // is added here; a leading one would yield ".../dbfetch//<db>/".
+            url_sb.append( db );
+            url_sb.append( '/' );
+        }
+        return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() );
}
public static List<String> queryUniprot( final String query, final int max_lines_to_return ) throws IOException {
- return queryDb( query, max_lines_to_return, BASE_URL );
+ return queryDb( query, max_lines_to_return, BASE_UNIPROT_URL ); // same call as before, now using the UniProt-specific base URL constant
}
public static List<String> queryDb( final String query, int max_lines_to_return, final String base_url )
return UniProtEntry.createInstanceFromPlainText( lines );
}
+    /**
+     * Looks up {@code id} via {@code queryEmblDb} and turns the returned lines into a
+     * sequence database entry with {@code EbiDbEntry.createInstanceForRefSeq}.
+     *
+     * @param id the identifier to look up
+     * @param max_lines_to_return forwarded unchanged to {@code queryEmblDb}
+     * @return the entry built from the returned lines
+     * @throws IOException propagated from {@code queryEmblDb}
+     */
+    public static SequenceDatabaseEntry obtainrefSeqentryFromEmbl( final Identifier id, final int max_lines_to_return )
+            throws IOException {
+        return EbiDbEntry.createInstanceForRefSeq( queryEmblDb( id, max_lines_to_return ) );
+    }
+
public static SequenceDatabaseEntry obtainEmblEntry( final String query, final int max_lines_to_return )
throws IOException {
- final List<String> lines = queryEmblDb( query, max_lines_to_return );
+ final List<String> lines = queryEmblDb( new Identifier( query ), max_lines_to_return ); // queryEmblDb takes an Identifier; NOTE(review): relies on Identifier(String) leaving the provider empty so that the embl database is selected - confirm
return EbiDbEntry.createInstanceFromPlainText( lines );
}
}