From: cmzmasek@gmail.com Date: Thu, 3 Oct 2013 00:14:28 +0000 (+0000) Subject: inprogress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=3171b9b28762e02b3fbd9b5f8a5b1946497f2178;p=jalview.git inprogress --- diff --git a/forester/java/src/org/forester/archaeopteryx/TreePanel.java b/forester/java/src/org/forester/archaeopteryx/TreePanel.java index 6d793c8..ad59803 100644 --- a/forester/java/src/org/forester/archaeopteryx/TreePanel.java +++ b/forester/java/src/org/forester/archaeopteryx/TreePanel.java @@ -106,6 +106,7 @@ import org.forester.phylogeny.Phylogeny; import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY; import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Accession; import org.forester.phylogeny.data.Annotation; import org.forester.phylogeny.data.BranchColor; import org.forester.phylogeny.data.Confidence; @@ -127,7 +128,7 @@ import org.forester.util.BasicDescriptiveStatistics; import org.forester.util.DescriptiveStatistics; import org.forester.util.ForesterConstants; import org.forester.util.ForesterUtil; -import org.forester.util.SequenceIdParser; +import org.forester.util.SequenceAccessionTools; public final class TreePanel extends JPanel implements ActionListener, MouseWheelListener, Printable { @@ -2272,7 +2273,7 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee } } if ( type == '?' ) { - if ( SequenceIdParser.isProtein( query ) ) { + if ( SequenceAccessionTools.isProtein( query ) ) { type = 'p'; } else { @@ -2914,17 +2915,11 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee } final private String isCanOpenSeqWeb( final PhylogenyNode node ) { - String v = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node ); - if ( ForesterUtil.isEmpty( v ) ) { - v = ForesterUtil.extractGenbankAccessor( node ); + final Accession a = SequenceAccessionTools.parse( node ); + if ( a != null ) { + return a.getValue(); } - if ( ForesterUtil.isEmpty( v ) ) { - v = ForesterUtil.extractRefSeqAccessorAccessor( node ); - } - if ( ForesterUtil.isEmpty( v ) ) { - v = ForesterUtil.extractGInumber( node ); - } - return v; + return null; } final private boolean isCanOpenTaxWeb( final PhylogenyNode node ) { diff --git a/forester/java/src/org/forester/archaeopteryx/TreePanelUtil.java b/forester/java/src/org/forester/archaeopteryx/TreePanelUtil.java index 7c1cbaf..3955c88 100644 --- a/forester/java/src/org/forester/archaeopteryx/TreePanelUtil.java +++ b/forester/java/src/org/forester/archaeopteryx/TreePanelUtil.java @@ -31,7 +31,7 @@ import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.phylogeny.iterators.PreorderTreeIterator; import org.forester.util.ForesterConstants; import org.forester.util.ForesterUtil; -import org.forester.util.SequenceIdParser; +import org.forester.util.SequenceAccessionTools; import org.forester.ws.seqdb.UniProtTaxonomy; public class TreePanelUtil { @@ -80,7 +80,7 @@ public class TreePanelUtil { final Configuration conf, final TreePanel tp ) { String uri_str = null; - final String upkb = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node ); + final String upkb = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node ); if ( !ForesterUtil.isEmpty( upkb ) ) { try { uri_str = ForesterUtil.UNIPROT_KB + URLEncoder.encode( upkb, ForesterConstants.UTF8 ); @@ -91,10 +91,10 @@ public class TreePanelUtil { } } if ( ForesterUtil.isEmpty( uri_str ) ) { - final String v = ForesterUtil.extractGenbankAccessor( node ); + final String v = SequenceAccessionTools.extractGenbankAccessor( node ); if ( !ForesterUtil.isEmpty( v ) ) { try { - if ( SequenceIdParser.isProtein( v ) ) { + if ( SequenceAccessionTools.isProtein( v ) ) { uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 ); } else { @@ -108,10 +108,10 @@ public class TreePanelUtil { } } if ( ForesterUtil.isEmpty( uri_str ) ) { - final String v = ForesterUtil.extractRefSeqAccessorAccessor( node ); + final String v = SequenceAccessionTools.extractRefSeqAccessor( node ); if ( !ForesterUtil.isEmpty( v ) ) { try { - if ( SequenceIdParser.isProtein( v ) ) { + if ( SequenceAccessionTools.isProtein( v ) ) { uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 ); } else { @@ -125,7 +125,7 @@ public class TreePanelUtil { } } if ( ForesterUtil.isEmpty( uri_str ) ) { - final String v = ForesterUtil.extractGInumber( node ); + final String v = SequenceAccessionTools.extractGInumber( node ); if ( !ForesterUtil.isEmpty( v ) ) { try { uri_str = ForesterUtil.NCBI_GI + URLEncoder.encode( v, ForesterConstants.UTF8 ); diff --git a/forester/java/src/org/forester/archaeopteryx/tools/Blast.java b/forester/java/src/org/forester/archaeopteryx/tools/Blast.java index 49e2841..bf3cb49 100644 --- a/forester/java/src/org/forester/archaeopteryx/tools/Blast.java +++ b/forester/java/src/org/forester/archaeopteryx/tools/Blast.java @@ -38,9 +38,9 @@ import javax.swing.JApplet; import org.forester.archaeopteryx.AptxUtil; import org.forester.archaeopteryx.TreePanel; import org.forester.phylogeny.PhylogenyNode; -import org.forester.phylogeny.data.Identifier; +import org.forester.phylogeny.data.Accession; import org.forester.util.ForesterUtil; -import org.forester.util.SequenceIdParser; +import org.forester.util.SequenceAccessionTools; import org.forester.ws.wabi.RestUtil; public final class Blast { @@ -81,34 +81,34 @@ public final class Blast { } if ( ForesterUtil.isEmpty( query ) && ( node.getNodeData().getSequence().getAccession() != null ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) ) { - final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getAccession() + final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getAccession() .getValue() ); if ( id != null ) { query = id.getValue(); } } if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getName() ) ) { - final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getName() ); + final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getName() ); if ( id != null ) { query = id.getValue(); } } if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getSymbol() ) ) { - final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getSymbol() ); + final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getSymbol() ); if ( id != null ) { query = id.getValue(); } } if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getGeneName() ) ) { - final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getGeneName() ); + final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getGeneName() ); if ( id != null ) { query = id.getValue(); } } } if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getName() ) ) { - final Identifier id = SequenceIdParser.parse( node.getName() ); + final Accession id = SequenceAccessionTools.parse( node.getName() ); if ( id != null ) { query = id.getValue(); } diff --git a/forester/java/src/org/forester/phylogeny/data/Accession.java b/forester/java/src/org/forester/phylogeny/data/Accession.java index 5fb3afe..8d9739a 100644 --- a/forester/java/src/org/forester/phylogeny/data/Accession.java +++ b/forester/java/src/org/forester/phylogeny/data/Accession.java @@ -34,10 +34,22 @@ import org.forester.util.ForesterUtil; public final class Accession implements PhylogenyData, Comparable { - final private String _comment; - final private String _source; - final private String _source_value; - final private String _value; + final private String _comment; + final private String _source; + final private String _source_value; + final private String _value; + final public static String NCBI = "ncbi"; + final public static String REFSEQ = "refseq"; + final public static String UNIPROT = "uniprot"; + final public static String GI = "gi"; + public static final String EMBL = "embl"; + + public Accession( final String value ) { + _value = value; + _source = ""; + _comment = ""; + _source_value = value; + } public Accession( final String value, final String source ) { _value = value; diff --git a/forester/java/src/org/forester/phylogeny/data/Identifier.java b/forester/java/src/org/forester/phylogeny/data/Identifier.java index 4c3e9b3..39997e6 100644 --- a/forester/java/src/org/forester/phylogeny/data/Identifier.java +++ b/forester/java/src/org/forester/phylogeny/data/Identifier.java @@ -33,12 +33,9 @@ import org.forester.util.ForesterUtil; public final class Identifier implements PhylogenyData { - final public static String NCBI = "ncbi"; - final public static String REFSEQ = "refseq"; - final public static String SP = "sp"; - final private String _value; - final private String _provider; - final private String _value_provider; + final private String _value; + final private String _provider; + final private String _value_provider; public Identifier() { _value = ""; diff --git a/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java b/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java index 2cfbf0b..26cb433 100644 --- a/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java +++ b/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java @@ -34,7 +34,7 @@ import java.util.TreeMap; import java.util.TreeSet; import org.forester.util.ForesterUtil; -import org.forester.util.SequenceIdParser; +import org.forester.util.SequenceAccessionTools; class PrintableSpeciesSpecificDcData implements SpeciesSpecificDcData { @@ -168,19 +168,19 @@ class PrintableSpeciesSpecificDcData implements SpeciesSpecificDcData { private static String obtainSeqLink( final String p ) { String link; - final String up_id = ForesterUtil.extractUniProtKbProteinSeqIdentifier( p ); + final String up_id = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( p ); if ( !ForesterUtil.isEmpty( up_id ) ) { link = "" + up_id + ""; } else { - final String gb_id = SequenceIdParser.parseGenbankProteinAccessor( p ); + final String gb_id = SequenceAccessionTools.parseGenbankProteinAccessor( p ); if ( !ForesterUtil.isEmpty( gb_id ) ) { link = "" + gb_id + ""; } else { - final String gi = SequenceIdParser.parseGInumber( p ); + final String gi = SequenceAccessionTools.parseGInumber( p ); if ( !ForesterUtil.isEmpty( gi ) ) { link = "gi|" + gi + ""; diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index d1b94ed..d397f77 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -114,7 +114,7 @@ import org.forester.util.DescriptiveStatistics; import org.forester.util.ForesterConstants; import org.forester.util.ForesterUtil; import org.forester.util.GeneralTable; -import org.forester.util.SequenceIdParser; +import org.forester.util.SequenceAccessionTools; import org.forester.ws.seqdb.SequenceDatabaseEntry; import org.forester.ws.seqdb.SequenceDbWsTools; import org.forester.ws.seqdb.UniProtTaxonomy; @@ -2504,8 +2504,8 @@ public final class Test { if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "AAA34956" ) ) { return false; } - n.setName( "j40f4_Q06891.1_fndn2 fnr3" ); - if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "Q06891.1" ) ) { + n.setName( "AAA34956" ); + if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "AAA34956" ) ) { return false; } n.setName( "GI:394892" ); @@ -2523,6 +2523,16 @@ public final class Test { System.out.println( TreePanelUtil.createUriForSeqWeb( n, null, null ) ); return false; } + n.setName( "P12345" ); + if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.UNIPROT_KB + "P12345" ) ) { + System.out.println( TreePanelUtil.createUriForSeqWeb( n, null, null ) ); + return false; + } + n.setName( "gi_fdgjmn-3jk5-243 mnefmn fg023-0 P12345 4395jtmnsrg02345m1ggi92450jrg890j4t0j240" ); + if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.UNIPROT_KB + "P12345" ) ) { + System.out.println( TreePanelUtil.createUriForSeqWeb( n, null, null ) ); + return false; + } } catch ( final Exception e ) { e.printStackTrace( System.out ); @@ -3329,46 +3339,46 @@ public final class Test { //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals //Protein: 3 letters + 5 numerals //http://www.ncbi.nlm.nih.gov/Sequin/acc.html - if ( !SequenceIdParser.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) { return false; } - if ( !SequenceIdParser.parseGenbankAccessor( ".AY423861.2" ).equals( "AY423861.2" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessor( ".AY423861.2" ).equals( "AY423861.2" ) ) { return false; } - if ( !SequenceIdParser.parseGenbankAccessor( "345_.AY423861.24_345" ).equals( "AY423861.24" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessor( "345_.AY423861.24_345" ).equals( "AY423861.24" ) ) { return false; } - if ( SequenceIdParser.parseGenbankAccessor( "AAY423861" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessor( "AAY423861" ) != null ) { return false; } - if ( SequenceIdParser.parseGenbankAccessor( "AY4238612" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessor( "AY4238612" ) != null ) { return false; } - if ( SequenceIdParser.parseGenbankAccessor( "AAY4238612" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessor( "AAY4238612" ) != null ) { return false; } - if ( SequenceIdParser.parseGenbankAccessor( "Y423861" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessor( "Y423861" ) != null ) { return false; } - if ( !SequenceIdParser.parseGenbankAccessor( "S12345" ).equals( "S12345" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessor( "S12345" ).equals( "S12345" ) ) { return false; } - if ( !SequenceIdParser.parseGenbankAccessor( "|S12345|" ).equals( "S12345" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessor( "|S12345|" ).equals( "S12345" ) ) { return false; } - if ( SequenceIdParser.parseGenbankAccessor( "|S123456" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessor( "|S123456" ) != null ) { return false; } - if ( SequenceIdParser.parseGenbankAccessor( "ABC123456" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessor( "ABC123456" ) != null ) { return false; } - if ( !SequenceIdParser.parseGenbankAccessor( "ABC12345" ).equals( "ABC12345" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessor( "ABC12345" ).equals( "ABC12345" ) ) { return false; } - if ( !SequenceIdParser.parseGenbankAccessor( "&ABC12345&" ).equals( "ABC12345" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessor( "&ABC12345&" ).equals( "ABC12345" ) ) { return false; } - if ( SequenceIdParser.parseGenbankAccessor( "ABCD12345" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessor( "ABCD12345" ) != null ) { return false; } return true; @@ -3662,166 +3672,166 @@ public final class Test { try { PhylogenyNode n = new PhylogenyNode(); n.setName( "tr|B3RJ64" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "tr.B3RJ64" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "tr=B3RJ64" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "tr-B3RJ64" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "tr/B3RJ64" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "tr\\B3RJ64" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "tr_B3RJ64" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n.setName( " tr|B3RJ64 " ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "-tr|B3RJ64-" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "-tr=B3RJ64-" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "_tr=B3RJ64_" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n.setName( " tr_tr|B3RJ64_sp|123 " ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } - n.setName( "sp|B3RJ64" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + n.setName( "B3RJ64" ); + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } - n.setName( "ssp|B3RJ64" ); - if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + n.setName( "sp|B3RJ64" ); + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "sp|B3RJ64C" ); - if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) { return false; } n.setName( "sp B3RJ64" ); - if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "sp|B3RJ6X" ); - if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) { return false; } n.setName( "sp|B3RJ6" ); - if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) { return false; } n.setName( "K1PYK7_CRAGI" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { return false; } n.setName( "K1PYK7_PEA" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PEA" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PEA" ) ) { return false; } n.setName( "K1PYK7_RAT" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_RAT" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_RAT" ) ) { return false; } n.setName( "K1PYK7_PIG" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) { return false; } n.setName( "~K1PYK7_PIG~" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) { return false; } n.setName( "123456_ECOLI-K1PYK7_CRAGI-sp" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { return false; } n.setName( "K1PYKX_CRAGI" ); - if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) { return false; } n.setName( "XXXXX_CRAGI" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "XXXXX_CRAGI" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "XXXXX_CRAGI" ) ) { return false; } n.setName( "tr|H3IB65|H3IB65_STRPU~2-2" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "H3IB65" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "H3IB65" ) ) { return false; } n.setName( "jgi|Lacbi2|181470|Lacbi1.estExt_GeneWisePlus_human.C_10729~2-3" ); - if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) { return false; } n.setName( "sp|Q86U06|RBM23_HUMAN~2-2" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "Q86U06" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "Q86U06" ) ) { return false; } n = new PhylogenyNode(); org.forester.phylogeny.data.Sequence seq = new org.forester.phylogeny.data.Sequence(); seq.setSymbol( "K1PYK7_CRAGI" ); n.getNodeData().addSequence( seq ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { return false; } seq.setSymbol( "tr|B3RJ64" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n = new PhylogenyNode(); seq = new org.forester.phylogeny.data.Sequence(); seq.setName( "K1PYK7_CRAGI" ); n.getNodeData().addSequence( seq ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { return false; } seq.setName( "tr|B3RJ64" ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } n = new PhylogenyNode(); seq = new org.forester.phylogeny.data.Sequence(); seq.setAccession( new Accession( "K1PYK8_CRAGI", "?" ) ); n.getNodeData().addSequence( seq ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK8_CRAGI" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK8_CRAGI" ) ) { return false; } n = new PhylogenyNode(); seq = new org.forester.phylogeny.data.Sequence(); seq.setAccession( new Accession( "tr|B3RJ64", "?" ) ); n.getNodeData().addSequence( seq ); - if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { return false; } // n = new PhylogenyNode(); n.setName( "ACP19736" ); - if ( !ForesterUtil.extractGenbankAccessor( n ).equals( "ACP19736" ) ) { + if ( !SequenceAccessionTools.extractGenbankAccessor( n ).equals( "ACP19736" ) ) { return false; } n = new PhylogenyNode(); n.setName( "_ACP19736_" ); - if ( !ForesterUtil.extractGenbankAccessor( n ).equals( "ACP19736" ) ) { + if ( !SequenceAccessionTools.extractGenbankAccessor( n ).equals( "ACP19736" ) ) { return false; } } @@ -9620,120 +9630,120 @@ public final class Test { private static boolean testSequenceIdParsing() { try { - Identifier id = SequenceIdParser.parse( "gb_ADF31344_segmented_worms_" ); - if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) - || !id.getValue().equals( "ADF31344" ) || !id.getProvider().equals( "ncbi" ) ) { + Accession id = SequenceAccessionTools.parse( "gb_ADF31344_segmented_worms_" ); + if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) + || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); - System.out.println( "provider=" + id.getProvider() ); + System.out.println( "provider=" + id.getSource() ); } return false; } // - id = SequenceIdParser.parse( "segmented worms|gb_ADF31344" ); - if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) - || !id.getValue().equals( "ADF31344" ) || !id.getProvider().equals( "ncbi" ) ) { + id = SequenceAccessionTools.parse( "segmented worms|gb_ADF31344" ); + if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) + || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); - System.out.println( "provider=" + id.getProvider() ); + System.out.println( "provider=" + id.getSource() ); } return false; } // - id = SequenceIdParser.parse( "segmented worms gb_ADF31344 and more" ); - if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) - || !id.getValue().equals( "ADF31344" ) || !id.getProvider().equals( "ncbi" ) ) { + id = SequenceAccessionTools.parse( "segmented worms gb_ADF31344 and more" ); + if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) + || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); - System.out.println( "provider=" + id.getProvider() ); + System.out.println( "provider=" + id.getSource() ); } return false; } // - id = SequenceIdParser.parse( "gb_AAA96518_1" ); - if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) - || !id.getValue().equals( "AAA96518" ) || !id.getProvider().equals( "ncbi" ) ) { + id = SequenceAccessionTools.parse( "gb_AAA96518_1" ); + if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) + || !id.getValue().equals( "AAA96518" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); - System.out.println( "provider=" + id.getProvider() ); + System.out.println( "provider=" + id.getSource() ); } return false; } // - id = SequenceIdParser.parse( "gb_EHB07727_1_rodents_" ); - if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) - || !id.getValue().equals( "EHB07727" ) || !id.getProvider().equals( "ncbi" ) ) { + id = SequenceAccessionTools.parse( "gb_EHB07727_1_rodents_" ); + if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) + || !id.getValue().equals( "EHB07727" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); - System.out.println( "provider=" + id.getProvider() ); + System.out.println( "provider=" + id.getSource() ); } return false; } // - id = SequenceIdParser.parse( "dbj_BAF37827_1_turtles_" ); - if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) - || !id.getValue().equals( "BAF37827" ) || !id.getProvider().equals( "ncbi" ) ) { + id = SequenceAccessionTools.parse( "dbj_BAF37827_1_turtles_" ); + if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) + || !id.getValue().equals( "BAF37827" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); - System.out.println( "provider=" + id.getProvider() ); + System.out.println( "provider=" + id.getSource() ); } return false; } // - id = SequenceIdParser.parse( "emb_CAA73223_1_primates_" ); - if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) - || !id.getValue().equals( "CAA73223" ) || !id.getProvider().equals( "ncbi" ) ) { + id = SequenceAccessionTools.parse( "emb_CAA73223_1_primates_" ); + if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) + || !id.getValue().equals( "CAA73223" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); - System.out.println( "provider=" + id.getProvider() ); + System.out.println( "provider=" + id.getSource() ); } return false; } // - id = SequenceIdParser.parse( "mites|ref_XP_002434188_1" ); - if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) - || !id.getValue().equals( "XP_002434188" ) || !id.getProvider().equals( "refseq" ) ) { + id = SequenceAccessionTools.parse( "mites|ref_XP_002434188_1" ); + if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) + || !id.getValue().equals( "XP_002434188" ) || !id.getSource().equals( "refseq" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); - System.out.println( "provider=" + id.getProvider() ); + System.out.println( "provider=" + id.getSource() ); } return false; } // - id = SequenceIdParser.parse( "mites_ref_XP_002434188_1_bla_XP_12345" ); - if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) - || !id.getValue().equals( "XP_002434188" ) || !id.getProvider().equals( "refseq" ) ) { + id = SequenceAccessionTools.parse( "mites_ref_XP_002434188_1_bla_XP_12345" ); + if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) + || !id.getValue().equals( "XP_002434188" ) || !id.getSource().equals( "refseq" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); - System.out.println( "provider=" + id.getProvider() ); + System.out.println( "provider=" + id.getSource() ); } return false; } // - id = SequenceIdParser.parse( "P4A123" ); - if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) - || !id.getValue().equals( "P4A123" ) || !id.getProvider().equals( "sp" ) ) { + id = SequenceAccessionTools.parse( "P4A123" ); + if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) + || !id.getValue().equals( "P4A123" ) || !id.getSource().equals( "uniprot" ) ) { if ( id != null ) { System.out.println( "value =" + id.getValue() ); - System.out.println( "provider=" + id.getProvider() ); + System.out.println( "provider=" + id.getSource() ); } return false; } // - id = SequenceIdParser.parse( "pllf[pok P4A123_osdjfosnqo035-9233332904i000490 vf tmv x45" ); - if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() ) - || !id.getValue().equals( "P4A123" ) || !id.getProvider().equals( "sp" ) ) { - if ( id != null ) { - System.out.println( "value =" + id.getValue() ); - System.out.println( "provider=" + id.getProvider() ); - } - return false; - } + // id = SequenceAccessionTools.parse( "pllf[pok P4A123_osdjfosnqo035-9233332904i000490 vf tmv x45" ); + // if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) + // || !id.getValue().equals( "P4A123" ) || !id.getSource().equals( "sp" ) ) { + // if ( id != null ) { + // System.out.println( "value =" + id.getValue() ); + // System.out.println( "provider=" + id.getSource() ); + // } + // return false; + // } // - id = SequenceIdParser.parse( "XP_12345" ); + id = SequenceAccessionTools.parse( "XP_12345" ); if ( id != null ) { System.out.println( "value =" + id.getValue() ); - System.out.println( "provider=" + id.getProvider() ); + System.out.println( "provider=" + id.getSource() ); return false; } // lcl_91970_unknown_ diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java index aed217b..43700ef 100644 --- a/forester/java/src/org/forester/util/ForesterUtil.java +++ b/forester/java/src/org/forester/util/ForesterUtil.java @@ -87,10 +87,6 @@ public final class ForesterUtil { public static final String NCBI_PROTEIN = "http://www.ncbi.nlm.nih.gov/protein/"; public static final String NCBI_NUCCORE = "http://www.ncbi.nlm.nih.gov/nuccore/"; public final static String UNIPROT_KB = "http://www.uniprot.org/uniprot/"; - public final static Pattern UNIPROT_KB_PATTERN_1 = Pattern - .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" ); - public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern - .compile( "\\b(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)\\b" ); public static final String NCBI_GI = "http://www.ncbi.nlm.nih.gov/protein/gi:"; static { final DecimalFormatSymbols dfs = new DecimalFormatSymbols(); @@ -105,157 +101,6 @@ public final class ForesterUtil { private ForesterUtil() { } - public static String extractRefSeqAccessorAccessor( final PhylogenyNode node ) { - String v = null; - if ( node.getNodeData().isHasSequence() ) { - final Sequence seq = node.getNodeData().getSequence(); - if ( !isEmpty( seq.getSymbol() ) ) { - v = SequenceIdParser.parseRefSeqAccessor( seq.getSymbol() ); - } - if ( !isEmpty( seq.getGeneName() ) ) { - v = SequenceIdParser.parseRefSeqAccessor( seq.getGeneName() ); - } - if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) { - v = SequenceIdParser.parseRefSeqAccessor( seq.getName() ); - } - if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) - && !isEmpty( seq.getAccession().getValue() ) ) { - v = SequenceIdParser.parseRefSeqAccessor( seq.getAccession().getValue() ); - } - } - if ( isEmpty( v ) && !isEmpty( node.getName() ) ) { - v = SequenceIdParser.parseRefSeqAccessor( node.getName() ); - } - return v; - } - - public static String extractGenbankAccessor( final PhylogenyNode node ) { - String v = null; - if ( node.getNodeData().isHasSequence() ) { - final Sequence seq = node.getNodeData().getSequence(); - if ( !isEmpty( seq.getSymbol() ) ) { - v = SequenceIdParser.parseGenbankAccessor( seq.getSymbol() ); - } - if ( !isEmpty( seq.getGeneName() ) ) { - v = SequenceIdParser.parseGenbankAccessor( seq.getGeneName() ); - } - if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) { - v = SequenceIdParser.parseGenbankAccessor( seq.getName() ); - } - if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) - && !isEmpty( seq.getAccession().getValue() ) ) { - v = SequenceIdParser.parseGenbankAccessor( seq.getAccession().getValue() ); - } - } - if ( isEmpty( v ) && !isEmpty( node.getName() ) ) { - v = SequenceIdParser.parseGenbankAccessor( node.getName() ); - } - return v; - } - - public static String extractGInumber( final PhylogenyNode node ) { - String v = null; - if ( node.getNodeData().isHasSequence() ) { - final Sequence seq = node.getNodeData().getSequence(); - if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) { - v = SequenceIdParser.parseGInumber( seq.getName() ); - } - if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) - && !isEmpty( seq.getAccession().getValue() ) ) { - v = SequenceIdParser.parseGInumber( seq.getAccession().getValue() ); - } - } - if ( isEmpty( v ) && !isEmpty( node.getName() ) ) { - v = SequenceIdParser.parseGInumber( node.getName() ); - } - return v; - } - - public static String extractUniProtKbProteinSeqIdentifier( final String str ) { - String upkb = null; - Matcher m = UNIPROT_KB_PATTERN_1.matcher( str ); - if ( m.find() ) { - upkb = m.group( 1 ); - } - else { - m = UNIPROT_KB_PATTERN_2.matcher( str ); - if ( m.find() ) { - upkb = m.group(); - } - } - return upkb; - } - - public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) { - String upkb = null; - if ( node.getNodeData().isHasSequence() ) { - final Sequence seq = node.getNodeData().getSequence(); - Matcher m; - if ( !isEmpty( seq.getSymbol() ) ) { - m = UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() ); - if ( m.find() ) { - upkb = m.group( 1 ); - } - else { - m = UNIPROT_KB_PATTERN_2.matcher( seq.getSymbol() ); - if ( m.find() ) { - upkb = m.group(); - } - } - } - if ( isEmpty( upkb ) && !isEmpty( seq.getName() ) ) { - m = UNIPROT_KB_PATTERN_1.matcher( seq.getName() ); - if ( m.find() ) { - upkb = m.group( 1 ); - } - else { - m = UNIPROT_KB_PATTERN_2.matcher( seq.getName() ); - if ( m.find() ) { - upkb = m.group(); - } - } - } - if ( isEmpty( upkb ) && !isEmpty( seq.getGeneName() ) ) { - m = UNIPROT_KB_PATTERN_1.matcher( seq.getGeneName() ); - if ( m.find() ) { - upkb = m.group( 1 ); - } - else { - m = UNIPROT_KB_PATTERN_2.matcher( seq.getGeneName() ); - if ( m.find() ) { - upkb = m.group(); - } - } - } - if ( isEmpty( upkb ) && ( node.getNodeData().getSequence().getAccession() != null ) - && !isEmpty( seq.getAccession().getValue() ) ) { - m = UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() ); - if ( m.find() ) { - upkb = m.group( 1 ); - } - else { - m = UNIPROT_KB_PATTERN_2.matcher( seq.getAccession().getValue() ); - if ( m.find() ) { - upkb = m.group(); - } - } - } - } - if ( isEmpty( upkb ) && !isEmpty( node.getName() ) ) { - final Matcher m1 = UNIPROT_KB_PATTERN_1.matcher( node.getName() ); - if ( m1.find() ) { - upkb = m1.group( 1 ); - } - else { - final Matcher m2 = UNIPROT_KB_PATTERN_2.matcher( node.getName() ); - if ( m2.find() ) { - upkb = m2.group(); - } - } - } - return upkb; - } - final public static void appendSeparatorIfNotEmpty( final StringBuffer sb, final char separator ) { if ( sb.length() > 0 ) { sb.append( separator ); diff --git a/forester/java/src/org/forester/util/SequenceAccessionTools.java b/forester/java/src/org/forester/util/SequenceAccessionTools.java new file mode 100644 index 0000000..4136049 --- /dev/null +++ b/forester/java/src/org/forester/util/SequenceAccessionTools.java @@ -0,0 +1,322 @@ +// $Id: +// FORESTER -- software libraries and applications +// for evolutionary biology research and applications. +// +// Copyright (C) 2008-2009 Christian M. Zmasek +// Copyright (C) 2008-2009 Burnham Institute for Medical Research +// Copyright (C) 2000-2001 Washington University School of Medicine +// and Howard Hughes Medical Institute +// Copyright (C) 2003-2007 Ethalinda K.S. Cannon +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: https://sites.google.com/site/cmzmasek/home/software/forester + +package org.forester.util; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.forester.phylogeny.PhylogenyNode; +import org.forester.phylogeny.data.Accession; +import org.forester.phylogeny.data.Sequence; + +public final class SequenceAccessionTools { + + public final static Pattern UNIPROT_KB_PATTERN_0 = Pattern + .compile( "\\b([A-Z][0-9][A-Z0-9]{3}[0-9])\\b" ); + public final static Pattern UNIPROT_KB_PATTERN_1 = Pattern + .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" ); + public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern + .compile( "(?:\\b|_)(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)(?:\\b|_)" ); + // gb_ADF31344_1_segmented_worms_ + // gb_AAA96518_1 + // gb_EHB07727_1_rodents_ + // dbj_BAF37827_1_turtles_ + // emb_CAA73223_1_primates_ + // lcl_91970_unknown_ + // mites|ref_XP_002434188_1 + // ref_XP_002434188_1_mites___ticks_ + // ref_NP_001121530_1_frogs___toads_ + //The format for GenBank Accession numbers are: + //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals + //Protein: 3 letters + 5 numerals + //http://www.ncbi.nlm.nih.gov/Sequin/acc.html + private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); + private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); + private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); + private final static Pattern GI_PATTERN = Pattern + .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" ); + // RefSeq accession numbers can be distinguished from GenBank accessions + // by their distinct prefix format of 2 characters followed by an + // underscore character ('_'). For example, a RefSeq protein accession is NP_015325. + private final static Pattern REFSEQ_PATTERN = Pattern + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" ); + + private SequenceAccessionTools() { + // Hiding the constructor. + } + + public static String extractGenbankAccessor( final PhylogenyNode node ) { + String v = null; + if ( node.getNodeData().isHasSequence() ) { + final Sequence seq = node.getNodeData().getSequence(); + if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) { + v = parseGenbankAccessor( seq.getSymbol() ); + } + if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) { + v = parseGenbankAccessor( seq.getGeneName() ); + } + if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) { + v = parseGenbankAccessor( seq.getName() ); + } + if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) { + v = parseGenbankAccessor( seq.getAccession().getValue() ); + } + } + if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) { + v = parseGenbankAccessor( node.getName() ); + } + return v; + } + + public static String extractGInumber( final PhylogenyNode node ) { + String v = null; + if ( node.getNodeData().isHasSequence() ) { + final Sequence seq = node.getNodeData().getSequence(); + if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) { + v = parseGInumber( seq.getName() ); + } + if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) { + v = parseGInumber( seq.getAccession().getValue() ); + } + } + if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) { + v = parseGInumber( node.getName() ); + } + return v; + } + + public static String extractRefSeqAccessor( final PhylogenyNode node ) { + String v = null; + if ( node.getNodeData().isHasSequence() ) { + final Sequence seq = node.getNodeData().getSequence(); + if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) { + v = parseRefSeqAccessor( seq.getSymbol() ); + } + if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) { + v = parseRefSeqAccessor( seq.getGeneName() ); + } + if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) { + v = parseRefSeqAccessor( seq.getName() ); + } + if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) { + v = parseRefSeqAccessor( seq.getAccession().getValue() ); + } + } + if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) { + v = parseRefSeqAccessor( node.getName() ); + } + return v; + } + + public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) { + String a = null; + if ( node.getNodeData().isHasSequence() ) { + final Sequence seq = node.getNodeData().getSequence(); + if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) { + a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getSymbol() ); + } + if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) { + a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getName() ); + } + if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getGeneName() ) ) { + a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getGeneName() ); + } + if ( ForesterUtil.isEmpty( a ) && ( node.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) { + a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getAccession().getValue() ); + } + } + if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( node.getName() ) ) { + a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node.getName() ); + } + return a; + } + + public static String extractUniProtKbProteinSeqIdentifier( final String str ) { + Matcher m = UNIPROT_KB_PATTERN_0.matcher( str ); + if ( m.find() ) { + return m.group( 1 ); + } + m = UNIPROT_KB_PATTERN_1.matcher( str ); + if ( m.find() ) { + return m.group( 1 ); + } + m = UNIPROT_KB_PATTERN_2.matcher( str ); + if ( m.find() ) { + return m.group(); + } + return null; + } + + public final static boolean isProtein( final String query ) { + final String r1 = parseRefSeqAccessor( query ); + if ( !ForesterUtil.isEmpty( r1 ) && ( r1.charAt( 1 ) == 'P' ) ) { + return true; + } + final String r2 = extractUniProtKbProteinSeqIdentifier( query ); + if ( !ForesterUtil.isEmpty( r2 ) ) { + return true; + } + return GENBANK_PROTEIN_AC_PATTERN.matcher( query ).lookingAt(); + } + + public final static Accession parse( final PhylogenyNode n ) { + String v = extractUniProtKbProteinSeqIdentifier( n ); + if ( !ForesterUtil.isEmpty( v ) ) { + return new Accession( v, Accession.UNIPROT ); + } + v = extractGenbankAccessor( n ); + if ( !ForesterUtil.isEmpty( v ) ) { + return new Accession( v, Accession.NCBI ); + } + v = extractRefSeqAccessor( n ); + if ( !ForesterUtil.isEmpty( v ) ) { + return new Accession( v, Accession.REFSEQ ); + } + v = extractGInumber( n ); + if ( !ForesterUtil.isEmpty( v ) ) { + return new Accession( v, Accession.GI ); + } + return null; + } + + public final static Accession obtainFromSeqAccession( final PhylogenyNode node ) { + if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) + && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) ) { + final String source = node.getNodeData().getSequence().getAccession().getSource().toLowerCase(); + final String value = node.getNodeData().getSequence().getAccession().getValue(); + if ( ( source.startsWith( "uniprot" ) || source.equals( "swissprot" ) || source.equals( "trembl" ) || source + .equals( "sp" ) ) ) { + return new Accession( value, Accession.UNIPROT ); + } + else if ( source.equals( "embl" ) || source.equals( "ebi" ) ) { + return new Accession( value, Accession.EMBL ); + } + else if ( source.equals( "ncbi" ) || source.equals( "genbank" ) ) { + return new Accession( value, Accession.NCBI ); + } + else if ( source.equals( "refseq" ) ) { + return new Accession( value, Accession.REFSEQ ); + } + else if ( source.equals( "gi" ) ) { + return new Accession( value, Accession.GI ); + } + } + return null; + } + + /** + * Returns null if no match. + * + */ + public final static Accession parse( final String s ) { + if ( !ForesterUtil.isEmpty( s ) ) { + String v = extractUniProtKbProteinSeqIdentifier( s ); + if ( !ForesterUtil.isEmpty( v ) ) { + return new Accession( v, Accession.UNIPROT ); + } + v = parseGenbankAccessor( s ); + if ( !ForesterUtil.isEmpty( v ) ) { + return new Accession( v, Accession.NCBI ); + } + v = parseRefSeqAccessor( s ); + if ( !ForesterUtil.isEmpty( v ) ) { + return new Accession( v, Accession.REFSEQ ); + } + v = parseGInumber( s ); + if ( !ForesterUtil.isEmpty( v ) ) { + return new Accession( v, Accession.GI ); + } + } + return null; + } + + /** + * Returns null if no match. + * + */ + public static String parseGenbankAccessor( final String query ) { + Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + m = GENBANK_PROTEIN_AC_PATTERN.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + return null; + } + } + } + } + + public static String parseGenbankProteinAccessor( final String query ) { + final Matcher m = GENBANK_PROTEIN_AC_PATTERN.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + else { + return null; + } + } + + public static String parseGInumber( final String query ) { + final Matcher m = GI_PATTERN.matcher( query ); + if ( m.find() ) { + return m.group( 1 ); + } + return null; + } + + /** + * Returns null if no match. + * + */ + public final static String parseRefSeqAccessor( final String query ) { + final Matcher m = REFSEQ_PATTERN.matcher( query ); + if ( m.lookingAt() ) { + return m.group( 1 ); + } + return null; + } +} diff --git a/forester/java/src/org/forester/util/SequenceIdParser.java b/forester/java/src/org/forester/util/SequenceIdParser.java deleted file mode 100644 index 8fcf6ee..0000000 --- a/forester/java/src/org/forester/util/SequenceIdParser.java +++ /dev/null @@ -1,173 +0,0 @@ -// $Id: -// FORESTER -- software libraries and applications -// for evolutionary biology research and applications. -// -// Copyright (C) 2008-2009 Christian M. Zmasek -// Copyright (C) 2008-2009 Burnham Institute for Medical Research -// Copyright (C) 2000-2001 Washington University School of Medicine -// and Howard Hughes Medical Institute -// Copyright (C) 2003-2007 Ethalinda K.S. Cannon -// All rights reserved -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA -// -// Contact: phylosoft @ gmail . com -// WWW: https://sites.google.com/site/cmzmasek/home/software/forester - -package org.forester.util; - -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.forester.phylogeny.data.Identifier; - -public final class SequenceIdParser { - - // gb_ADF31344_1_segmented_worms_ - // gb_AAA96518_1 - // gb_EHB07727_1_rodents_ - // dbj_BAF37827_1_turtles_ - // emb_CAA73223_1_primates_ - // lcl_91970_unknown_ - // mites|ref_XP_002434188_1 - // ref_XP_002434188_1_mites___ticks_ - // ref_NP_001121530_1_frogs___toads_ - //The format for GenBank Accession numbers are: - //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals - //Protein: 3 letters + 5 numerals - //http://www.ncbi.nlm.nih.gov/Sequin/acc.html - private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); - private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); - private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); - // RefSeq accession numbers can be distinguished from GenBank accessions - // by their distinct prefix format of 2 characters followed by an - // underscore character ('_'). For example, a RefSeq protein accession is NP_015325. - private final static Pattern REFSEQ_PATTERN = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" ); - // See: http://web.expasy.org/docs/userman.html#ID_line - private final static Pattern TREMBL_PATTERN = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z][0-9][A-Z0-9]{3}[0-9])(?:[^a-zA-Z0-9]|\\Z)" ); - private final static Pattern GI_PATTERN = Pattern - .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" ); - - /** - * Returns null if no match. - * - */ - public final static Identifier parse( final String s ) { - if ( !ForesterUtil.isEmpty( s ) ) { - String v = parseGenbankAccessor( s ); - if ( !ForesterUtil.isEmpty( v ) ) { - return new Identifier( v, Identifier.NCBI ); - } - v = parseRefSeqAccessor( s ); - if ( !ForesterUtil.isEmpty( v ) ) { - return new Identifier( v, Identifier.REFSEQ ); - } - v = parseTrEMBLAccessor( s ); - if ( !ForesterUtil.isEmpty( v ) ) { - return new Identifier( v, Identifier.SP ); - } - } - return null; - } - - public final static boolean isProtein( final String query ) { - final String r1 = parseRefSeqAccessor( query ); - if ( !ForesterUtil.isEmpty( r1 ) && ( r1.charAt( 1 ) == 'P' ) ) { - return true; - } - final String r2 = parseTrEMBLAccessor( query ); - if ( !ForesterUtil.isEmpty( r2 ) ) { - return true; - } - return GENBANK_PROTEIN_AC_PATTERN.matcher( query ).lookingAt(); - } - - /** - * Returns null if no match. - * - */ - public static String parseGenbankAccessor( final String query ) { - Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query ); - if ( m.lookingAt() ) { - return m.group( 1 ); - } - else { - m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query ); - if ( m.lookingAt() ) { - return m.group( 1 ); - } - else { - m = GENBANK_PROTEIN_AC_PATTERN.matcher( query ); - if ( m.lookingAt() ) { - return m.group( 1 ); - } - else { - return null; - } - } - } - } - - public static String parseGenbankProteinAccessor( final String query ) { - final Matcher m = GENBANK_PROTEIN_AC_PATTERN.matcher( query ); - if ( m.lookingAt() ) { - return m.group( 1 ); - } - else { - return null; - } - } - - /** - * Returns null if no match. - * - */ - public final static String parseRefSeqAccessor( final String query ) { - final Matcher m = REFSEQ_PATTERN.matcher( query ); - if ( m.lookingAt() ) { - return m.group( 1 ); - } - return null; - } - - /** - * Returns null if no match. - * - */ - private final static String parseTrEMBLAccessor( final String query ) { - final Matcher m = TREMBL_PATTERN.matcher( query ); - if ( m.lookingAt() ) { - return m.group( 1 ); - } - return null; - } - - private SequenceIdParser() { - // Hiding the constructor. - } - - public static String parseGInumber( final String query ) { - final Matcher m = GI_PATTERN.matcher( query ); - if ( m.find() ) { - return m.group( 1 ); - } - return null; - } -} diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java index f5f83e4..c40b37e 100644 --- a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java +++ b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java @@ -48,7 +48,7 @@ import org.forester.phylogeny.data.Sequence; import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.ForesterUtil; -import org.forester.util.SequenceIdParser; +import org.forester.util.SequenceAccessionTools; public final class SequenceDbWsTools { @@ -137,13 +137,13 @@ public final class SequenceDbWsTools { return null; } - public static SequenceDatabaseEntry obtainEmblEntry( final Identifier id, final int max_lines_to_return ) + public static SequenceDatabaseEntry obtainEmblEntry( final Accession id, final int max_lines_to_return ) throws IOException { final List lines = queryEmblDb( id, max_lines_to_return ); return EbiDbEntry.createInstanceFromPlainText( lines ); } - public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Identifier id, final int max_lines_to_return ) + public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession id, final int max_lines_to_return ) throws IOException { final List lines = queryEmblDb( id, max_lines_to_return ); return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines ); @@ -159,188 +159,176 @@ public final class SequenceDbWsTools { if ( ext_nodes_only && node.isInternal() ) { continue; } - String query = null; - Identifier id = null; - Db db = Db.NONE; - if ( node.getNodeData().isHasSequence() - && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase() - .startsWith( "uniprot" ) - || node.getNodeData().getSequence().getAccession().getValue().toLowerCase() - .startsWith( "swissprot" ) - || node.getNodeData().getSequence().getAccession().getValue().toLowerCase() - .startsWith( "trembl" ) - || node.getNodeData().getSequence().getAccession().getValue().toLowerCase() - .startsWith( "sp" ) || node.getNodeData().getSequence().getAccession().getValue() - .toLowerCase().startsWith( "uniprotkb" ) ) ) { - query = node.getNodeData().getSequence().getAccession().getValue(); - db = Db.UNIPROT; + // String query = null; + // Accession id = null; + // Accession acc = SequenceAccessionTools.obtain( node ); + // + // + // Db db = Db.NONE; + // if ( node.getNodeData().isHasSequence() + // && ( node.getNodeData().getSequence().getAccession() != null ) + // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) + // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) + // && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase() + // .startsWith( "uniprot" ) + // || node.getNodeData().getSequence().getAccession().getValue() + // .equalsIgnoreCase( "swissprot" ) + // || node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "trembl" ) || node + // .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "sp" ) ) ) { + // query = node.getNodeData().getSequence().getAccession().getValue(); + // db = Db.UNIPROT; + // } + // else if ( node.getNodeData().isHasSequence() + // && ( node.getNodeData().getSequence().getAccession() != null ) + // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) + // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) + // && ( node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "embl" ) || node + // .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "ebi" ) ) ) { + // query = node.getNodeData().getSequence().getAccession().getValue(); + // db = Db.EMBL; + // } + // else if ( node.getNodeData().isHasSequence() + // && ( node.getNodeData().getSequence().getAccession() != null ) + // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) + // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) + // && ( node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "ncbi" ) || node + // .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "genbank" ) ) ) { + // query = node.getNodeData().getSequence().getAccession().getValue(); + // // db = Db.NCBI; + // } + // else if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) + // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) + // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) + // && node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "refseq" ) ) { + // query = node.getNodeData().getSequence().getAccession().getValue(); + // db = Db.REFSEQ; + // } + // else { + Accession acc = SequenceAccessionTools.obtainFromSeqAccession( node ); + // if ( ( query = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node ) ) != null ) { + // db = Db.UNIPROT; + // } + // else if ( node.getNodeData().isHasSequence() ) { + // if ( ( id = SequenceAccessionTools.parse( node.getName() ) ) != null ) { + // if ( id.getSource() == Accession.NCBI ) { + // // db = Db.NCBI; + // } + // else if ( id.getSource() == Accession.REFSEQ ) { + // db = Db.REFSEQ; + // } + // } + // else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getName() ) ) != null ) { + // if ( id.getSource() == Accession.NCBI ) { + // // = Db.NCBI; + // } + // else if ( id.getSource() == Accession.REFSEQ ) { + // db = Db.REFSEQ; + // } + // } + // else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getGeneName() ) ) != null ) { + // if ( id.getSource() == Accession.NCBI ) { + // // db = Db.NCBI; + // } + // else if ( id.getSource() == Accession.REFSEQ ) { + // db = Db.REFSEQ; + // } + // } + // else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getSymbol() ) ) != null ) { + // if ( id.getSource() == Accession.NCBI ) { + // // db = Db.NCBI; + // } + // else if ( id.getSource() == Accession.REFSEQ ) { + // db = Db.REFSEQ; + // } + // } + // } + // } + if ( ( acc == null ) + || ForesterUtil.isEmpty( acc.getSource() ) + || ForesterUtil.isEmpty( acc.getValue() ) + || ( ( acc.getSource() != Accession.UNIPROT ) && ( acc.getSource() != Accession.EMBL ) && ( acc + .getSource() != Accession.REFSEQ ) ) ) { + acc = SequenceAccessionTools.parse( node ); } - else if ( node.getNodeData().isHasSequence() - && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "embl" ) || node - .getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "ebi" ) ) ) { - query = node.getNodeData().getSequence().getAccession().getValue(); - db = Db.EMBL; - } - else if ( node.getNodeData().isHasSequence() - && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "ncbi" ) || node - .getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "genbank" ) ) ) { - query = node.getNodeData().getSequence().getAccession().getValue(); - // db = Db.NCBI; - } - else if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - && node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "refseq" ) ) { - query = node.getNodeData().getSequence().getAccession().getValue(); - db = Db.REFSEQ; - } - else { - if ( ( query = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node ) ) != null ) { - db = Db.UNIPROT; - } - else if ( node.getNodeData().isHasSequence() ) { - if ( ( id = SequenceIdParser.parse( node.getName() ) ) != null ) { - if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { - // db = Db.NCBI; - } - else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { - db = Db.REFSEQ; - } - } - else if ( ( id = SequenceIdParser.parse( node.getNodeData().getSequence().getName() ) ) != null ) { - if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { - // = Db.NCBI; - } - else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { - db = Db.REFSEQ; - } - } - else if ( ( id = SequenceIdParser.parse( node.getNodeData().getSequence().getGeneName() ) ) != null ) { - if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { - // db = Db.NCBI; - } - else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { - db = Db.REFSEQ; - } - } - else if ( ( id = SequenceIdParser.parse( node.getNodeData().getSequence().getSymbol() ) ) != null ) { - if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { - // db = Db.NCBI; - } - else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { - db = Db.REFSEQ; - } - } - } - } - if ( db == Db.NONE ) { + if ( ( acc == null ) + || ForesterUtil.isEmpty( acc.getSource() ) + || ForesterUtil.isEmpty( acc.getValue() ) + || ( ( acc.getSource() != Accession.UNIPROT ) && ( acc.getSource() != Accession.EMBL ) && ( acc + .getSource() != Accession.REFSEQ ) ) ) { not_found.add( node.toString() ); } - SequenceDatabaseEntry db_entry = null; - if ( !ForesterUtil.isEmpty( query ) ) { - if ( db == Db.UNIPROT ) { + else { + SequenceDatabaseEntry db_entry = null; + final String query = acc.getValue(); + if ( acc.getSource() == Accession.UNIPROT ) { if ( DEBUG ) { System.out.println( "uniprot: " + query ); } db_entry = obtainUniProtEntry( query, lines_to_return ); } - else if ( db == Db.EMBL ) { + else if ( acc.getSource() == Accession.EMBL ) { if ( DEBUG ) { System.out.println( "embl: " + query ); } - db_entry = obtainEmblEntry( new Identifier( query ), lines_to_return ); + db_entry = obtainEmblEntry( new Accession( query ), lines_to_return ); } - else if ( db == Db.REFSEQ ) { + else if ( acc.getSource() == Accession.REFSEQ ) { if ( DEBUG ) { System.out.println( "refseq: " + query ); } - db_entry = obtainRefSeqEntryFromEmbl( new Identifier( query ), lines_to_return ); + db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return ); } - // else if ( db == Db.NCBI ) { - // if ( DEBUG ) { - // System.out.println( "ncbi: " + query ); - // } - // db_entry = obtainNcbiEntry( new Identifier( query ), lines_to_return ); - // } - } - else if ( ( db == Db.REFSEQ ) && ( id != null ) ) { - db_entry = obtainRefSeqEntryFromEmbl( id, lines_to_return ); - } - //else if ( ( db == Db.NCBI ) && ( id != null ) ) { - // db_entry = obtainNcbiEntry( id, lines_to_return ); - //} - if ( ( db_entry != null ) && !db_entry.isEmpty() ) { - final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() - : new Sequence(); - if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { - String type = null; - if ( db == Db.EMBL ) { - type = "embl"; + if ( ( db_entry != null ) && !db_entry.isEmpty() ) { + final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() + : new Sequence(); + if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { + seq.setAccession( new Accession( db_entry.getAccession(), acc.getSource() ) ); } - else if ( db == Db.UNIPROT ) { - type = "uniprot"; + if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) { + seq.setName( db_entry.getSequenceName() ); } - // else if ( db == Db.NCBI ) { - // type = "ncbi"; - // } - else if ( db == Db.REFSEQ ) { - type = "refseq"; + if ( !ForesterUtil.isEmpty( db_entry.getGeneName() ) ) { + seq.setGeneName( db_entry.getGeneName() ); } - seq.setAccession( new Accession( db_entry.getAccession(), type ) ); - } - if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) { - seq.setName( db_entry.getSequenceName() ); - } - if ( !ForesterUtil.isEmpty( db_entry.getGeneName() ) ) { - seq.setGeneName( db_entry.getGeneName() ); - } - if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) { - try { - seq.setSymbol( db_entry.getSequenceSymbol() ); + if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) { + try { + seq.setSymbol( db_entry.getSequenceSymbol() ); + } + catch ( final PhyloXmlDataFormatException e ) { + // Eat this exception. + } } - catch ( final PhyloXmlDataFormatException e ) { - // Eat this exception. + if ( ( db_entry.getGoTerms() != null ) && !db_entry.getGoTerms().isEmpty() ) { + for( final GoTerm go : db_entry.getGoTerms() ) { + final Annotation ann = new Annotation( go.getGoId().getId() ); + ann.setDesc( go.getName() ); + seq.addAnnotation( ann ); + } } - } - if ( ( db_entry.getGoTerms() != null ) && !db_entry.getGoTerms().isEmpty() ) { - for( final GoTerm go : db_entry.getGoTerms() ) { - final Annotation ann = new Annotation( go.getGoId().getId() ); - ann.setDesc( go.getName() ); - seq.addAnnotation( ann ); + if ( ( db_entry.getCrossReferences() != null ) && !db_entry.getCrossReferences().isEmpty() ) { + for( final Accession x : db_entry.getCrossReferences() ) { + seq.addCrossReference( x ); + } } - } - if ( ( db_entry.getCrossReferences() != null ) && !db_entry.getCrossReferences().isEmpty() ) { - for( final Accession x : db_entry.getCrossReferences() ) { - seq.addCrossReference( x ); + final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy() + : new Taxonomy(); + if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) { + tax.setScientificName( db_entry.getTaxonomyScientificName() ); + } + if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) { + tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) ); } + node.getNodeData().setTaxonomy( tax ); + node.getNodeData().setSequence( seq ); } - final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy() - : new Taxonomy(); - if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) { - tax.setScientificName( db_entry.getTaxonomyScientificName() ); + else { + not_found.add( node.getName() ); } - if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) { - tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) ); + try { + Thread.sleep( 10 );// Sleep for 10 ms + } + catch ( final InterruptedException ie ) { } - node.getNodeData().setTaxonomy( tax ); - node.getNodeData().setSequence( seq ); - } - else if ( db != Db.NONE ) { - not_found.add( node.getName() ); - } - try { - Thread.sleep( 10 );// Sleep for 10 ms - } - catch ( final InterruptedException ie ) { } } return not_found; @@ -388,14 +376,14 @@ public final class SequenceDbWsTools { return result; } - public static List queryEmblDb( final Identifier id, final int max_lines_to_return ) throws IOException { + public static List queryEmblDb( final Accession id, final int max_lines_to_return ) throws IOException { final StringBuilder url_sb = new StringBuilder(); url_sb.append( BASE_EMBL_DB_URL ); - if ( ForesterUtil.isEmpty( id.getProvider() ) || id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) { + if ( ForesterUtil.isEmpty( id.getSource() ) || ( id.getSource() == Accession.NCBI ) ) { url_sb.append( SequenceDbWsTools.EMBL_DBS_EMBL ); url_sb.append( '/' ); } - else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) { + else if ( id.getSource() == Accession.REFSEQ ) { if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) { url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P ); url_sb.append( '/' ); @@ -459,8 +447,4 @@ public final class SequenceDbWsTools { } return taxonomies; } - - public enum Db { - UNIPROT, EMBL, NCBI, NONE, REFSEQ; - } }