sb.append( "For the following node no data was found:\n" );
}
else {
- sb.append( "For the following nodes no data was found: (total: " + not_found.size() + "):\n" );
+ sb.append( "For the following nodes no data was found (total: " + not_found.size() + "):\n" );
}
int i = 0;
for( final String string : not_found ) {
public final class Accession implements PhylogenyData, Comparable<Accession> {
- final private String _comment;
- final private String _source;
- final private String _source_value;
- final private String _value;
- final public static String NCBI = "ncbi";
- final public static String REFSEQ = "refseq";
- final public static String UNIPROT = "uniprot";
- final public static String GI = "gi";
- public static final String EMBL = "embl";
+ final private String _comment;
+ final private String _source;
+ final private String _source_value;
+ final private String _value;
+
+ /**
+  * The databases an accession can originate from. toString() returns the
+  * lower-case label of the constant (for example "ncbi" for NCBI).
+  */
+ public enum Source {
+     NCBI( "ncbi" ), REFSEQ( "refseq" ), UNIPROT( "uniprot" ), GI( "gi" ), EMBL( "embl" ), UNKNOWN( "unknown" );
+
+     // Label returned by toString().
+     private final String _label;
+
+     private Source( final String label ) {
+         _label = label;
+     }
+
+     @Override
+     public String toString() {
+         return _label;
+     }
+ }
public Accession( final String value ) {
_value = value;
}
}
+ public Accession( final String value, final Source source ) {
+ _value = value;
+ _source = source.toString();
+ _comment = "";
+ _source_value = source + value;
+ }
+
public Accession( final String value, final String source, final String comment ) {
_value = value;
_source = source;
import java.util.Locale;
import java.util.Set;
import java.util.SortedSet;
+import java.util.TreeSet;
import org.forester.application.support_transfer;
import org.forester.archaeopteryx.TreePanelUtil;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE;
import org.forester.phylogeny.data.Accession;
+import org.forester.phylogeny.data.Accession.Source;
import org.forester.phylogeny.data.BinaryCharacters;
import org.forester.phylogeny.data.BranchWidth;
import org.forester.phylogeny.data.Confidence;
System.exit( -1 );
}
final long start_time = new Date().getTime();
+ System.out.print( "Basic node methods: " );
+ if ( Test.testBasicNodeMethods() ) {
+ System.out.println( "OK." );
+ succeeded++;
+ }
+ else {
+ System.out.println( "failed." );
+ failed++;
+ }
System.out.print( "Protein id: " );
if ( !testProteinId() ) {
System.out.println( "failed." );
System.out.println( "failed." );
failed++;
}
- System.out.print( "Hmmscan output parser: " );
- if ( testHmmscanOutputParser() ) {
+ System.out.print( "UniProtKB id extraction: " );
+ if ( Test.testExtractUniProtKbProteinSeqIdentifier() ) {
System.out.println( "OK." );
succeeded++;
}
System.out.println( "failed." );
failed++;
}
- System.out.print( "Basic node methods: " );
- if ( Test.testBasicNodeMethods() ) {
+ System.out.print( "Sequence DB tools 1: " );
+ if ( testSequenceDbWsTools1() ) {
System.out.println( "OK." );
succeeded++;
}
System.out.println( "failed." );
failed++;
}
- System.out.print( "Taxonomy code extraction: " );
- if ( Test.testExtractTaxonomyCodeFromNodeName() ) {
+ System.out.print( "Sequence DB tools 2: " );
+ if ( testSequenceDbWsTools2() ) {
System.out.println( "OK." );
succeeded++;
}
else {
System.out.println( "failed." );
failed++;
+ System.exit( -1 );
}
- System.out.print( "SN extraction: " );
- if ( Test.testExtractSNFromNodeName() ) {
+ System.exit( 0 );
+ System.out.print( "Hmmscan output parser: " );
+ if ( testHmmscanOutputParser() ) {
System.out.println( "OK." );
succeeded++;
}
System.out.println( "failed." );
failed++;
}
- System.out.print( "Taxonomy extraction (general): " );
- if ( Test.testTaxonomyExtraction() ) {
+ System.out.print( "Taxonomy code extraction: " );
+ if ( Test.testExtractTaxonomyCodeFromNodeName() ) {
System.out.println( "OK." );
succeeded++;
}
System.out.println( "failed." );
failed++;
}
- System.out.print( "UniProtKB id extraction: " );
- if ( Test.testExtractUniProtKbProteinSeqIdentifier() ) {
+ System.out.print( "SN extraction: " );
+ if ( Test.testExtractSNFromNodeName() ) {
+ System.out.println( "OK." );
+ succeeded++;
+ }
+ else {
+ System.out.println( "failed." );
+ failed++;
+ }
+ System.out.print( "Taxonomy extraction (general): " );
+ if ( Test.testTaxonomyExtraction() ) {
System.out.println( "OK." );
succeeded++;
}
return false;
}
n = new PhylogenyNode();
- n.setName( "_ACP19736_" );
+ n.setName( "|ACP19736|" );
if ( !SequenceAccessionTools.obtainGenbankAccessorFromDataFields( n ).equals( "ACP19736" ) ) {
return false;
}
}
return false;
}
- //
- // id = SequenceAccessionTools.parse( "pllf[pok P4A123_osdjfosnqo035-9233332904i000490 vf tmv x45" );
- // if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
- // || !id.getValue().equals( "P4A123" ) || !id.getSource().equals( "sp" ) ) {
- // if ( id != null ) {
- // System.out.println( "value =" + id.getValue() );
- // System.out.println( "provider=" + id.getSource() );
- // }
- // return false;
- // }
- //
id = SequenceAccessionTools.parseAccessorFromString( "XP_12345" );
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
System.out.println( "provider=" + id.getSource() );
return false;
}
- // lcl_91970_unknown_
}
catch ( final Exception e ) {
e.printStackTrace( System.out );
return true;
}
+ private static boolean testSequenceDbWsTools1() {
+ try {
+ PhylogenyNode n = new PhylogenyNode();
+ n.setName( "NP_001025424" );
+ Accession acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.REFSEQ.toString() )
+ || !acc.getValue().equals( "NP_001025424" ) ) {
+ return false;
+ }
+ n.setName( "340 0559 -- _NP_001025424_dsfdg15 05" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.REFSEQ.toString() )
+ || !acc.getValue().equals( "NP_001025424" ) ) {
+ return false;
+ }
+ n.setName( "NP_001025424.1" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.REFSEQ.toString() )
+ || !acc.getValue().equals( "NP_001025424" ) ) {
+ return false;
+ }
+ n.setName( "NM_001030253" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.REFSEQ.toString() )
+ || !acc.getValue().equals( "NM_001030253" ) ) {
+ return false;
+ }
+ n.setName( "BCL2_HUMAN" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.UNIPROT.toString() )
+ || !acc.getValue().equals( "BCL2_HUMAN" ) ) {
+ System.out.println( acc.toString() );
+ return false;
+ }
+ n.setName( "P10415" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.UNIPROT.toString() )
+ || !acc.getValue().equals( "P10415" ) ) {
+ System.out.println( acc.toString() );
+ return false;
+ }
+ n.setName( " P10415 " );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.UNIPROT.toString() )
+ || !acc.getValue().equals( "P10415" ) ) {
+ System.out.println( acc.toString() );
+ return false;
+ }
+ n.setName( "_P10415|" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.UNIPROT.toString() )
+ || !acc.getValue().equals( "P10415" ) ) {
+ System.out.println( acc.toString() );
+ return false;
+ }
+ n.setName( "AY695820" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.NCBI.toString() )
+ || !acc.getValue().equals( "AY695820" ) ) {
+ System.out.println( acc.toString() );
+ return false;
+ }
+ n.setName( "_AY695820_" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.NCBI.toString() )
+ || !acc.getValue().equals( "AY695820" ) ) {
+ System.out.println( acc.toString() );
+ return false;
+ }
+ n.setName( "AAA59452" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.NCBI.toString() )
+ || !acc.getValue().equals( "AAA59452" ) ) {
+ System.out.println( acc.toString() );
+ return false;
+ }
+ n.setName( "_AAA59452_" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.NCBI.toString() )
+ || !acc.getValue().equals( "AAA59452" ) ) {
+ System.out.println( acc.toString() );
+ return false;
+ }
+ n.setName( "AAA59452.1" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.NCBI.toString() )
+ || !acc.getValue().equals( "AAA59452.1" ) ) {
+ System.out.println( acc.toString() );
+ return false;
+ }
+ n.setName( "_AAA59452.1_" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.NCBI.toString() )
+ || !acc.getValue().equals( "AAA59452.1" ) ) {
+ System.out.println( acc.toString() );
+ return false;
+ }
+ n.setName( "GI:94894583" );
+ acc = SequenceDbWsTools.obtainSeqAccession( n );
+ if ( acc == null || !acc.getSource().equals( Source.GI.toString() ) || !acc.getValue().equals( "94894583" ) ) {
+ System.out.println( acc.toString() );
+ return false;
+ }
+ }
+ // catch ( final IOException e ) {
+ // System.out.println();
+ // System.out.println( "the following might be due to absence internet connection:" );
+ // e.printStackTrace( System.out );
+ // return true;
+ // }
+ catch ( final Exception e ) {
+ return false;
+ }
+ return true;
+ }
+
+ private static boolean testSequenceDbWsTools2() {
+ try {
+ PhylogenyNode n1 = new PhylogenyNode();
+ n1.setName( "NP_001025424" );
+ SequenceDbWsTools.obtainSeqInformation( false, 4000, new TreeSet<String>(), n1 );
+ if ( !n1.getNodeData().getSequence().getName().equals( "Bcl2" ) ) {
+ return false;
+ }
+ if ( !n1.getNodeData().getTaxonomy().getScientificName().equals( "Danio rerio" ) ) {
+ return false;
+ }
+ PhylogenyNode n2 = new PhylogenyNode();
+ n2.setName( "NM_001030253" );
+ SequenceDbWsTools.obtainSeqInformation( false, 4000, new TreeSet<String>(), n2 );
+ System.out.println( n2.toString() );
+ if ( !n2.getNodeData().getSequence().getName()
+ .equals( "Danio rerio B-cell leukemia/lymphoma 2 (bcl2), mRNA" ) ) {
+ return false;
+ }
+ if ( !n2.getNodeData().getTaxonomy().getScientificName().equals( "Danio rerio" ) ) {
+ return false;
+ }
+ }
+ catch ( final IOException e ) {
+ System.out.println();
+ System.out.println( "the following might be due to absence internet connection:" );
+ e.printStackTrace( System.out );
+ return true;
+ }
+ catch ( final Exception e ) {
+ return false;
+ }
+ return true;
+ }
+
private static boolean testUniprotEntryRetrieval() {
try {
final SequenceDatabaseEntry entry = SequenceDbWsTools.obtainUniProtEntry( "P12345", 200 );
\r
import org.forester.phylogeny.PhylogenyNode;\r
import org.forester.phylogeny.data.Accession;\r
+import org.forester.phylogeny.data.Accession.Source;\r
import org.forester.phylogeny.data.Sequence;\r
\r
public final class SequenceAccessionTools {\r
public final static Pattern GENBANK_PROT_PATTERN = Pattern\r
.compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
public final static Pattern GI_PATTERN = Pattern.compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" );\r
- public final static Pattern UNIPROT_KB_PATTERN_0 = Pattern.compile( "\\b([A-Z][0-9][A-Z0-9]{3}[0-9])\\b" );\r
+ public final static Pattern UNIPROT_KB_PATTERN_0 = Pattern\r
+ .compile( "(?:\\b|_)([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );\r
public final static Pattern UNIPROT_KB_PATTERN_1 = Pattern\r
.compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );\r
public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern\r
public final static Accession obtainAccessorFromDataFields( final PhylogenyNode n ) {\r
String a = obtainUniProtAccessorFromDataFields( n );\r
if ( !ForesterUtil.isEmpty( a ) ) {\r
- return new Accession( a, Accession.UNIPROT );\r
+ return new Accession( a, Source.UNIPROT );\r
}\r
a = obtainGenbankAccessorFromDataFields( n );\r
if ( !ForesterUtil.isEmpty( a ) ) {\r
- return new Accession( a, Accession.NCBI );\r
+ return new Accession( a, Source.NCBI );\r
}\r
a = obtainRefSeqAccessorFromDataFields( n );\r
if ( !ForesterUtil.isEmpty( a ) ) {\r
- return new Accession( a, Accession.REFSEQ );\r
+ return new Accession( a, Source.REFSEQ );\r
}\r
a = obtainGiNumberFromDataFields( n );\r
if ( !ForesterUtil.isEmpty( a ) ) {\r
- return new Accession( a, Accession.GI );\r
+ return new Accession( a, Source.GI );\r
}\r
return null;\r
}\r
final String value = n.getNodeData().getSequence().getAccession().getValue();\r
if ( ( source.startsWith( "uniprot" ) || source.equals( "swissprot" ) || source.equals( "trembl" ) || source\r
.equals( "sp" ) ) ) {\r
- return new Accession( value, Accession.UNIPROT );\r
+ return new Accession( value, Source.UNIPROT );\r
}\r
else if ( source.equals( "embl" ) || source.equals( "ebi" ) ) {\r
- return new Accession( value, Accession.EMBL );\r
+ return new Accession( value, Source.EMBL );\r
}\r
else if ( source.equals( "ncbi" ) || source.equals( "genbank" ) ) {\r
- return new Accession( value, Accession.NCBI );\r
+ return new Accession( value, Source.NCBI );\r
}\r
else if ( source.equals( "refseq" ) ) {\r
- return new Accession( value, Accession.REFSEQ );\r
+ return new Accession( value, Source.REFSEQ );\r
}\r
else if ( source.equals( "gi" ) ) {\r
- return new Accession( value, Accession.GI );\r
+ return new Accession( value, Source.GI );\r
}\r
}\r
return null;\r
if ( !ForesterUtil.isEmpty( s ) ) {\r
String v = parseUniProtAccessorFromString( s );\r
if ( !ForesterUtil.isEmpty( v ) ) {\r
- return new Accession( v, Accession.UNIPROT );\r
+ return new Accession( v, Source.UNIPROT );\r
}\r
v = parseGenbankAccessorFromString( s );\r
if ( !ForesterUtil.isEmpty( v ) ) {\r
- return new Accession( v, Accession.NCBI );\r
+ return new Accession( v, Source.NCBI );\r
}\r
v = parseRefSeqAccessorFromString( s );\r
if ( !ForesterUtil.isEmpty( v ) ) {\r
- return new Accession( v, Accession.REFSEQ );\r
+ return new Accession( v, Source.REFSEQ );\r
}\r
v = parseGInumberFromString( s );\r
if ( !ForesterUtil.isEmpty( v ) ) {\r
- return new Accession( v, Accession.GI );\r
+ return new Accession( v, Source.GI );\r
}\r
}\r
return null;\r
}\r
\r
public final static String parseUniProtAccessorFromString( final String s ) {\r
- Matcher m = UNIPROT_KB_PATTERN_0.matcher( s );\r
- if ( m.find() ) {\r
- return m.group( 1 );\r
- }\r
- m = UNIPROT_KB_PATTERN_1.matcher( s );\r
+ Matcher m = UNIPROT_KB_PATTERN_1.matcher( s );\r
if ( m.find() ) {\r
return m.group( 1 );\r
}\r
if ( m.find() ) {\r
return m.group();\r
}\r
+ m = UNIPROT_KB_PATTERN_0.matcher( s );\r
+ if ( m.find() ) {\r
+ return m.group( 1 );\r
+ }\r
return null;\r
}\r
}\r
if ( line.indexOf( "[" ) > 0 ) {
e.setDe( DatabaseTools.extract( line, "DEFINITION", "[" ) );
}
+ else if ( line.indexOf( "." ) > 0 ) {
+ e.setDe( DatabaseTools.extract( line, "DEFINITION", "." ) );
+ }
else {
e.setDe( DatabaseTools.extract( line, "DEFINITION" ) );
}
import org.forester.phylogeny.Phylogeny;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.data.Accession;
+import org.forester.phylogeny.data.Accession.Source;
import org.forester.phylogeny.data.Annotation;
import org.forester.phylogeny.data.Identifier;
import org.forester.phylogeny.data.Sequence;
public final class SequenceDbWsTools {
- public final static String BASE_EMBL_DB_URL = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/";
+ public final static String EMBL_REFSEQ = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=REFSEQ&style=raw&id=";
public final static String BASE_UNIPROT_URL = "http://www.uniprot.org/";
public final static String EMBL_DBS_EMBL = "embl";
public final static String EMBL_DBS_REFSEQ_N = "refseqn";
return EbiDbEntry.createInstanceFromPlainText( lines );
}
- public final static Accession obtainFromSeqAccession( final PhylogenyNode node ) {
+ public final static Accession obtainSeqAccession( final PhylogenyNode node ) {
Accession acc = SequenceAccessionTools.obtainFromSeqAccession( node );
if ( !isAccessionAcceptable( acc ) ) {
acc = SequenceAccessionTools.obtainAccessorFromDataFields( node );
public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession id, final int max_lines_to_return )
throws IOException {
- final List<String> lines = queryEmblDb( id, max_lines_to_return );
+ final List<String> lines = queryEmblDbForRefSeqEntry( id, max_lines_to_return );
return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines );
}
final int lines_to_return,
final SortedSet<String> not_found,
final PhylogenyNode node ) throws IOException {
- final Accession acc = obtainFromSeqAccession( node );
+ final Accession acc = obtainSeqAccession( node );
if ( !isAccessionAcceptable( acc ) ) {
if ( node.isExternal() || !node.isEmpty() ) {
not_found.add( node.toString() );
return result;
}
+ /**
+  * Queries for a RefSeq entry by delegating to queryDb() with the value of
+  * the given accession, the line limit, and EMBL_REFSEQ as the URL argument.
+  *
+  * @param id                  the accession whose value is used as query
+  * @param max_lines_to_return line limit passed on to queryDb()
+  * @return the lines returned by queryDb()
+  * @throws IOException if queryDb() throws it
+  */
+ public static List<String> queryEmblDbForRefSeqEntry( final Accession id, final int max_lines_to_return )
+         throws IOException {
+     final String accession_value = id.getValue();
+     return queryDb( accession_value, max_lines_to_return, EMBL_REFSEQ );
+ }
+
public static List<String> queryEmblDb( final Accession id, final int max_lines_to_return ) throws IOException {
final StringBuilder url_sb = new StringBuilder();
- url_sb.append( BASE_EMBL_DB_URL );
- if ( ForesterUtil.isEmpty( id.getSource() ) || ( id.getSource() == Accession.NCBI ) ) {
- url_sb.append( SequenceDbWsTools.EMBL_DBS_EMBL );
+ // url_sb.append( BASE_EMBL_DB_URL );
+ if ( ForesterUtil.isEmpty( id.getSource() ) || ( id.getSource().equals( Source.NCBI.toString() ) ) ) {
+ url_sb.append( EMBL_DBS_EMBL );
url_sb.append( '/' );
}
- else if ( id.getSource() == Accession.REFSEQ ) {
- if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) {
- url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P );
- url_sb.append( '/' );
- }
- else {
- url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_N );
- url_sb.append( '/' );
- }
+ else if ( id.getSource().equals( Source.REFSEQ.toString() ) ) {
+ url_sb.append( EMBL_REFSEQ );
+ // if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) {
+ // url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P );
+ // url_sb.append( '/' );
+ // }
+ // else {
+ // url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_N );
+ // url_sb.append( '/' );
+ // }
}
return queryDb( id.getValue(), max_lines_to_return, url_sb.toString() );
}
final Accession acc ) throws IOException {
SequenceDatabaseEntry db_entry = null;
final String query = acc.getValue();
- if ( acc.getSource() == Accession.UNIPROT ) {
+ if ( acc.getSource().equals( Source.UNIPROT.toString() ) ) {
if ( DEBUG ) {
System.out.println( "uniprot: " + query );
}
// Eat this, and move to next.
}
}
- else if ( acc.getSource() == Accession.EMBL ) {
+ else if ( acc.getSource().equals( Source.EMBL.toString() ) ) {
if ( DEBUG ) {
System.out.println( "embl: " + query );
}
// Eat this, and move to next.
}
}
- else if ( acc.getSource() == Accession.REFSEQ ) {
+ else if ( acc.getSource().equals( Source.REFSEQ.toString() ) ) {
if ( DEBUG ) {
System.out.println( "refseq: " + query );
}
private final static boolean isAccessionAcceptable( final Accession acc ) {
return ( !( ( acc == null ) || ForesterUtil.isEmpty( acc.getSource() ) || ForesterUtil.isEmpty( acc.getValue() ) || ( ( acc
- .getSource() != Accession.UNIPROT ) && ( acc.getSource() != Accession.EMBL ) && ( acc.getSource() != Accession.REFSEQ ) ) ) );
+ .getSource().equals( Source.UNIPROT.toString() ) )
+ && ( acc.getSource().toString().equals( Source.EMBL.toString() ) ) && ( acc.getSource().toString()
+ .equals( Source.REFSEQ.toString() ) ) ) ) );
}
private static List<UniProtTaxonomy> parseUniProtTaxonomy( final List<String> result ) throws IOException {