import org.forester.phylogeny.data.Taxonomy;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.util.ForesterUtil;
+import org.forester.util.SequenceIdParser;
import org.forester.ws.uniprot.DatabaseTools;
import org.forester.ws.uniprot.SequenceDatabaseEntry;
import org.forester.ws.uniprot.UniProtWsTools;
private final static boolean DEBUG = false;
private enum Db {
- UNKNOWN, UNIPROT, EMBL;
+ // UNIPROT is assigned when the node name parses as a UniProt accessor,
+ // NCBI when it parses as a GenBank accessor.
+ UNKNOWN, UNIPROT, EMBL, NCBI;
}
public SequenceDataRetriver( final MainFrameApplication mf, final TreePanel treepanel, final Phylogeny phy ) {
if ( ( query = UniProtWsTools.parseUniProtAccessor( node.getName() ) ) != null ) {
db = Db.UNIPROT;
}
- else if ( ( query = DatabaseTools.parseGenbankAccessor( node.getName() ) ) != null ) {
- db = Db.EMBL;
+ else if ( ( query = SequenceIdParser.parseGenbankAccessor( node.getName() ) ) != null ) {
+ db = Db.NCBI;
}
}
if ( !ForesterUtil.isEmpty( query ) ) {
//Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
//Protein: 3 letters + 5 numerals
//http://www.ncbi.nlm.nih.gov/Sequin/acc.html
- if ( !DatabaseTools.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) {
+ if ( !SequenceIdParser.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) {
return false;
}
- if ( !DatabaseTools.parseGenbankAccessor( ".AY423861." ).equals( "AY423861" ) ) {
+ if ( !SequenceIdParser.parseGenbankAccessor( ".AY423861." ).equals( "AY423861" ) ) {
return false;
}
- if ( DatabaseTools.parseGenbankAccessor( "AAY423861" ) != null ) {
+ if ( SequenceIdParser.parseGenbankAccessor( "AAY423861" ) != null ) {
return false;
}
- if ( DatabaseTools.parseGenbankAccessor( "AY4238612" ) != null ) {
+ if ( SequenceIdParser.parseGenbankAccessor( "AY4238612" ) != null ) {
return false;
}
- if ( DatabaseTools.parseGenbankAccessor( "AAY4238612" ) != null ) {
+ if ( SequenceIdParser.parseGenbankAccessor( "AAY4238612" ) != null ) {
return false;
}
- if ( DatabaseTools.parseGenbankAccessor( "Y423861" ) != null ) {
+ if ( SequenceIdParser.parseGenbankAccessor( "Y423861" ) != null ) {
return false;
}
- if ( !DatabaseTools.parseGenbankAccessor( "S12345" ).equals( "S12345" ) ) {
+ if ( !SequenceIdParser.parseGenbankAccessor( "S12345" ).equals( "S12345" ) ) {
return false;
}
- if ( !DatabaseTools.parseGenbankAccessor( "|S12345|" ).equals( "S12345" ) ) {
+ if ( !SequenceIdParser.parseGenbankAccessor( "|S12345|" ).equals( "S12345" ) ) {
return false;
}
- if ( DatabaseTools.parseGenbankAccessor( "|S123456" ) != null ) {
+ if ( SequenceIdParser.parseGenbankAccessor( "|S123456" ) != null ) {
return false;
}
- if ( DatabaseTools.parseGenbankAccessor( "ABC123456" ) != null ) {
+ if ( SequenceIdParser.parseGenbankAccessor( "ABC123456" ) != null ) {
return false;
}
- if ( !DatabaseTools.parseGenbankAccessor( "ABC12345" ).equals( "ABC12345" ) ) {
+ if ( !SequenceIdParser.parseGenbankAccessor( "ABC12345" ).equals( "ABC12345" ) ) {
return false;
}
- if ( !DatabaseTools.parseGenbankAccessor( "&ABC12345&" ).equals( "ABC12345" ) ) {
+ if ( !SequenceIdParser.parseGenbankAccessor( "&ABC12345&" ).equals( "ABC12345" ) ) {
return false;
}
- if ( DatabaseTools.parseGenbankAccessor( "ABCD12345" ) != null ) {
+ if ( SequenceIdParser.parseGenbankAccessor( "ABCD12345" ) != null ) {
return false;
}
return true;
|| ForesterUtil.isEmpty( id.getValue() )
|| ForesterUtil.isEmpty( id.getProvider() )
|| !id.getValue().equals( "ADF31344" )
- || !id.getProvider().equals( "genbank" ) ) {
+ || !id.getProvider().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
System.out.println( "provider=" + id.getProvider() );
|| ForesterUtil.isEmpty( id.getValue() )
|| ForesterUtil.isEmpty( id.getProvider() )
|| !id.getValue().equals( "ADF31344" )
- || !id.getProvider().equals( "genbank" ) ) {
+ || !id.getProvider().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
System.out.println( "provider=" + id.getProvider() );
|| ForesterUtil.isEmpty( id.getValue() )
|| ForesterUtil.isEmpty( id.getProvider() )
|| !id.getValue().equals( "ADF31344" )
- || !id.getProvider().equals( "genbank" ) ) {
+ || !id.getProvider().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
System.out.println( "provider=" + id.getProvider() );
|| ForesterUtil.isEmpty( id.getValue() )
|| ForesterUtil.isEmpty( id.getProvider() )
|| !id.getValue().equals( "AAA96518" )
- || !id.getProvider().equals( "genbank" ) ) {
+ || !id.getProvider().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
System.out.println( "provider=" + id.getProvider() );
|| ForesterUtil.isEmpty( id.getValue() )
|| ForesterUtil.isEmpty( id.getProvider() )
|| !id.getValue().equals( "EHB07727" )
- || !id.getProvider().equals( "genbank" ) ) {
+ || !id.getProvider().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
System.out.println( "provider=" + id.getProvider() );
|| ForesterUtil.isEmpty( id.getValue() )
|| ForesterUtil.isEmpty( id.getProvider() )
|| !id.getValue().equals( "BAF37827" )
- || !id.getProvider().equals( "genbank" ) ) {
+ || !id.getProvider().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
System.out.println( "provider=" + id.getProvider() );
|| ForesterUtil.isEmpty( id.getValue() )
|| ForesterUtil.isEmpty( id.getProvider() )
|| !id.getValue().equals( "CAA73223" )
- || !id.getProvider().equals( "genbank" ) ) {
+ || !id.getProvider().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
System.out.println( "provider=" + id.getProvider() );
.compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6})(?:[^a-zA-Z0-9]|\\Z)" );\r
private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern\r
.compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );\r
private final static boolean DEBUG = false;\r
\r
\r
- \r
+ /**\r
+ * Builds an Identifier from an accession found in s.\r
+ * parseGenbankAccessor is tried first, then parseRefSeqAccessor; a\r
+ * non-empty result from either is returned with the provider "ncbi".\r
+ * \r
+ * @param s the text to parse\r
+ * @return the Identifier, or null if no match\r
+ */\r
public final static Identifier parse( final String s ) {\r
- String v = DatabaseTools.parseGenbankAccessor( s );\r
+ String v = parseGenbankAccessor( s );\r
+ if ( !ForesterUtil.isEmpty( v ) ) {\r
+ return new Identifier( v, "ncbi" );\r
+ }\r
+ v = parseRefSeqAccessor( s );\r
if ( !ForesterUtil.isEmpty( v ) ) {\r
- return new Identifier( v, "genbank" );\r
+ return new Identifier( v, "ncbi" );\r
}\r
- \r
return null;\r
}\r
\r
+ /**\r
+ * Extracts a GenBank accession number from a string.\r
+ * <p>\r
+ * The two nucleotide patterns are tried first, then the protein pattern;\r
+ * the first pattern that matches decides the result.\r
+ * \r
+ * @param query the text to search\r
+ * @return capture group 1 of the first matching pattern, or null if no match\r
+ */\r
+ static public String parseGenbankAccessor( final String query ) {\r
+ final Pattern[] candidates = { GENBANK_NUCLEOTIDE_AC_PATTERN_1, GENBANK_NUCLEOTIDE_AC_PATTERN_2,\r
+ GENBANK_PROTEIN_AC_PATTERN };\r
+ for( final Pattern candidate : candidates ) {\r
+ final Matcher matcher = candidate.matcher( query );\r
+ if ( matcher.lookingAt() ) {\r
+ return matcher.group( 1 );\r
+ }\r
+ }\r
+ return null;\r
+ }\r
\r
+ // RefSeq accessions are a two-letter prefix, an underscore, and six or\r
+ // more digits (e.g. NP_015325); the underscore is what distinguishes\r
+ // them from GenBank accessions.\r
+ private final static Pattern REFSEQ_AC_PATTERN = Pattern\r
+ .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );\r
+ \r
+ /**\r
+ * Extracts a RefSeq accession number from a string.\r
+ * <p>\r
+ * The accession must be delimited by the start/end of the string or by a\r
+ * non-alphanumeric character; a trailing version suffix such as ".1" is\r
+ * not part of the returned value.\r
+ * \r
+ * @param query the text to search\r
+ * @return the accession (e.g. "NP_015325"), or null if no match\r
+ */\r
+ public final static String parseRefSeqAccessor( final String query ) {\r
+ final Matcher m = REFSEQ_AC_PATTERN.matcher( query );\r
+ if ( m.lookingAt() ) {\r
+ return m.group( 1 );\r
+ }\r
+ return null;\r
+ }\r
\r
\r
\r
package org.forester.ws.uniprot;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
public class DatabaseTools {
- //The format for GenBank Accession numbers are:
- //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
- //Protein: 3 letters + 5 numerals
- //http://www.ncbi.nlm.nih.gov/Sequin/acc.html
- private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern
- .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );
- private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern
- .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6})(?:[^a-zA-Z0-9]|\\Z)" );
- private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern
- .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );
- private final static boolean DEBUG = false;
-
- /**
- * Returns null if no match.
- *
- * @param query
- * @param db
- * @return
- */
- static public String parseGenbankAccessor( final String query ) {
- Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query );
- if ( m.lookingAt() ) {
- return m.group( 1 );
- }
- else {
- m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query );
- if ( m.lookingAt() ) {
- return m.group( 1 );
- }
- else {
- m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );
- if ( m.lookingAt() ) {
- return m.group( 1 );
- }
- else {
- return null;
- }
- }
- }
- }
-
+
static String extract( final String target, final String a, final String b ) {
final int i_a = target.indexOf( a );
final int i_b = target.indexOf( b );