From: cmzmasek@gmail.com Date: Thu, 3 Oct 2013 01:22:16 +0000 (+0000) Subject: inprogress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=397137f4f105810096dc6034ccc1e1013ed11644;p=jalview.git inprogress --- diff --git a/forester/java/src/org/forester/archaeopteryx/TreePanel.java b/forester/java/src/org/forester/archaeopteryx/TreePanel.java index ad59803..88c4d66 100644 --- a/forester/java/src/org/forester/archaeopteryx/TreePanel.java +++ b/forester/java/src/org/forester/archaeopteryx/TreePanel.java @@ -2273,7 +2273,7 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee } } if ( type == '?' ) { - if ( SequenceAccessionTools.isProtein( query ) ) { + if ( SequenceAccessionTools.isProteinDbQuery( query ) ) { type = 'p'; } else { @@ -2915,7 +2915,7 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee } final private String isCanOpenSeqWeb( final PhylogenyNode node ) { - final Accession a = SequenceAccessionTools.parse( node ); + final Accession a = SequenceAccessionTools.obtainAccessorFromDataFields( node ); if ( a != null ) { return a.getValue(); } diff --git a/forester/java/src/org/forester/archaeopteryx/TreePanelUtil.java b/forester/java/src/org/forester/archaeopteryx/TreePanelUtil.java index 3955c88..61db054 100644 --- a/forester/java/src/org/forester/archaeopteryx/TreePanelUtil.java +++ b/forester/java/src/org/forester/archaeopteryx/TreePanelUtil.java @@ -80,7 +80,7 @@ public class TreePanelUtil { final Configuration conf, final TreePanel tp ) { String uri_str = null; - final String upkb = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node ); + final String upkb = SequenceAccessionTools.obtainUniProtAccessorFromDataFields( node ); if ( !ForesterUtil.isEmpty( upkb ) ) { try { uri_str = ForesterUtil.UNIPROT_KB + URLEncoder.encode( upkb, ForesterConstants.UTF8 ); @@ -91,10 +91,10 @@ public class TreePanelUtil { } } if ( ForesterUtil.isEmpty( uri_str ) ) { - final String v = SequenceAccessionTools.extractGenbankAccessor( node ); + final String v = SequenceAccessionTools.obtainGenbankAccessorFromDataFields( node ); if ( !ForesterUtil.isEmpty( v ) ) { try { - if ( SequenceAccessionTools.isProtein( v ) ) { + if ( SequenceAccessionTools.isProteinDbQuery( v ) ) { uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 ); } else { @@ -108,10 +108,10 @@ public class TreePanelUtil { } } if ( ForesterUtil.isEmpty( uri_str ) ) { - final String v = SequenceAccessionTools.extractRefSeqAccessor( node ); + final String v = SequenceAccessionTools.obtainRefSeqAccessorFromDataFields( node ); if ( !ForesterUtil.isEmpty( v ) ) { try { - if ( SequenceAccessionTools.isProtein( v ) ) { + if ( SequenceAccessionTools.isProteinDbQuery( v ) ) { uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 ); } else { @@ -125,7 +125,7 @@ public class TreePanelUtil { } } if ( ForesterUtil.isEmpty( uri_str ) ) { - final String v = SequenceAccessionTools.extractGInumber( node ); + final String v = SequenceAccessionTools.obtainGiNumberFromDataFields( node ); if ( !ForesterUtil.isEmpty( v ) ) { try { uri_str = ForesterUtil.NCBI_GI + URLEncoder.encode( v, ForesterConstants.UTF8 ); diff --git a/forester/java/src/org/forester/archaeopteryx/tools/Blast.java b/forester/java/src/org/forester/archaeopteryx/tools/Blast.java index bf3cb49..72c23c1 100644 --- a/forester/java/src/org/forester/archaeopteryx/tools/Blast.java +++ b/forester/java/src/org/forester/archaeopteryx/tools/Blast.java @@ -81,34 +81,34 @@ public final class Blast { } if ( ForesterUtil.isEmpty( query ) && ( node.getNodeData().getSequence().getAccession() != null ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) ) { - final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getAccession() + final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getNodeData().getSequence().getAccession() .getValue() ); if ( id != null ) { query = id.getValue(); } } if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getName() ) ) { - final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getName() ); + final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getNodeData().getSequence().getName() ); if ( id != null ) { query = id.getValue(); } } if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getSymbol() ) ) { - final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getSymbol() ); + final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getNodeData().getSequence().getSymbol() ); if ( id != null ) { query = id.getValue(); } } if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getGeneName() ) ) { - final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getGeneName() ); + final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getNodeData().getSequence().getGeneName() ); if ( id != null ) { query = id.getValue(); } } } if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getName() ) ) { - final Accession id = SequenceAccessionTools.parse( node.getName() ); + final Accession id = SequenceAccessionTools.parseAccessorFromString( node.getName() ); if ( id != null ) { query = id.getValue(); } diff --git a/forester/java/src/org/forester/phylogeny/PhylogenyNode.java b/forester/java/src/org/forester/phylogeny/PhylogenyNode.java index 05876e7..01416eb 100644 --- a/forester/java/src/org/forester/phylogeny/PhylogenyNode.java +++ b/forester/java/src/org/forester/phylogeny/PhylogenyNode.java @@ -83,6 +83,9 @@ public final class PhylogenyNode implements Comparable { _descendants = null; } + public boolean isEmpty() { + } + /** * Adds PhylogenyNode n to the list of child nodes and sets the _parent of n * to this. diff --git a/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java b/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java index 26cb433..6c6e6fd 100644 --- a/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java +++ b/forester/java/src/org/forester/surfacing/PrintableSpeciesSpecificDcData.java @@ -168,19 +168,19 @@ class PrintableSpeciesSpecificDcData implements SpeciesSpecificDcData { private static String obtainSeqLink( final String p ) { String link; - final String up_id = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( p ); + final String up_id = SequenceAccessionTools.parseUniProtAccessorFromString( p ); if ( !ForesterUtil.isEmpty( up_id ) ) { link = "" + up_id + ""; } else { - final String gb_id = SequenceAccessionTools.parseGenbankProteinAccessor( p ); + final String gb_id = SequenceAccessionTools.parseGenbankProteinAccessorFromString( p ); if ( !ForesterUtil.isEmpty( gb_id ) ) { link = "" + gb_id + ""; } else { - final String gi = SequenceAccessionTools.parseGInumber( p ); + final String gi = SequenceAccessionTools.parseGInumberFromString( p ); if ( !ForesterUtil.isEmpty( gi ) ) { link = "gi|" + gi + ""; diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index d397f77..61b3d3d 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -3339,46 +3339,46 @@ public final class Test { //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals //Protein: 3 letters + 5 numerals //http://www.ncbi.nlm.nih.gov/Sequin/acc.html - if ( !SequenceAccessionTools.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessorFromString( "AY423861" ).equals( "AY423861" ) ) { return false; } - if ( !SequenceAccessionTools.parseGenbankAccessor( ".AY423861.2" ).equals( "AY423861.2" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessorFromString( ".AY423861.2" ).equals( "AY423861.2" ) ) { return false; } - if ( !SequenceAccessionTools.parseGenbankAccessor( "345_.AY423861.24_345" ).equals( "AY423861.24" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessorFromString( "345_.AY423861.24_345" ).equals( "AY423861.24" ) ) { return false; } - if ( SequenceAccessionTools.parseGenbankAccessor( "AAY423861" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessorFromString( "AAY423861" ) != null ) { return false; } - if ( SequenceAccessionTools.parseGenbankAccessor( "AY4238612" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessorFromString( "AY4238612" ) != null ) { return false; } - if ( SequenceAccessionTools.parseGenbankAccessor( "AAY4238612" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessorFromString( "AAY4238612" ) != null ) { return false; } - if ( SequenceAccessionTools.parseGenbankAccessor( "Y423861" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessorFromString( "Y423861" ) != null ) { return false; } - if ( !SequenceAccessionTools.parseGenbankAccessor( "S12345" ).equals( "S12345" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessorFromString( "S12345" ).equals( "S12345" ) ) { return false; } - if ( !SequenceAccessionTools.parseGenbankAccessor( "|S12345|" ).equals( "S12345" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessorFromString( "|S12345|" ).equals( "S12345" ) ) { return false; } - if ( SequenceAccessionTools.parseGenbankAccessor( "|S123456" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessorFromString( "|S123456" ) != null ) { return false; } - if ( SequenceAccessionTools.parseGenbankAccessor( "ABC123456" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessorFromString( "ABC123456" ) != null ) { return false; } - if ( !SequenceAccessionTools.parseGenbankAccessor( "ABC12345" ).equals( "ABC12345" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessorFromString( "ABC12345" ).equals( "ABC12345" ) ) { return false; } - if ( !SequenceAccessionTools.parseGenbankAccessor( "&ABC12345&" ).equals( "ABC12345" ) ) { + if ( !SequenceAccessionTools.parseGenbankAccessorFromString( "&ABC12345&" ).equals( "ABC12345" ) ) { return false; } - if ( SequenceAccessionTools.parseGenbankAccessor( "ABCD12345" ) != null ) { + if ( SequenceAccessionTools.parseGenbankAccessorFromString( "ABCD12345" ) != null ) { return false; } return true; @@ -3672,166 +3672,166 @@ public final class Test { try { PhylogenyNode n = new PhylogenyNode(); n.setName( "tr|B3RJ64" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "tr.B3RJ64" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "tr=B3RJ64" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "tr-B3RJ64" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "tr/B3RJ64" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "tr\\B3RJ64" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "tr_B3RJ64" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( " tr|B3RJ64 " ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "-tr|B3RJ64-" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "-tr=B3RJ64-" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "_tr=B3RJ64_" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( " tr_tr|B3RJ64_sp|123 " ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "B3RJ64" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "sp|B3RJ64" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "sp|B3RJ64C" ); - if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + if ( SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ) != null ) { return false; } n.setName( "sp B3RJ64" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n.setName( "sp|B3RJ6X" ); - if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + if ( SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ) != null ) { return false; } n.setName( "sp|B3RJ6" ); - if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + if ( SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ) != null ) { return false; } n.setName( "K1PYK7_CRAGI" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_CRAGI" ) ) { return false; } n.setName( "K1PYK7_PEA" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PEA" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_PEA" ) ) { return false; } n.setName( "K1PYK7_RAT" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_RAT" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_RAT" ) ) { return false; } n.setName( "K1PYK7_PIG" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_PIG" ) ) { return false; } n.setName( "~K1PYK7_PIG~" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_PIG" ) ) { return false; } n.setName( "123456_ECOLI-K1PYK7_CRAGI-sp" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_CRAGI" ) ) { return false; } n.setName( "K1PYKX_CRAGI" ); - if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + if ( SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ) != null ) { return false; } n.setName( "XXXXX_CRAGI" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "XXXXX_CRAGI" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "XXXXX_CRAGI" ) ) { return false; } n.setName( "tr|H3IB65|H3IB65_STRPU~2-2" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "H3IB65" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "H3IB65" ) ) { return false; } n.setName( "jgi|Lacbi2|181470|Lacbi1.estExt_GeneWisePlus_human.C_10729~2-3" ); - if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + if ( SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ) != null ) { return false; } n.setName( "sp|Q86U06|RBM23_HUMAN~2-2" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "Q86U06" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "Q86U06" ) ) { return false; } n = new PhylogenyNode(); org.forester.phylogeny.data.Sequence seq = new org.forester.phylogeny.data.Sequence(); seq.setSymbol( "K1PYK7_CRAGI" ); n.getNodeData().addSequence( seq ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_CRAGI" ) ) { return false; } seq.setSymbol( "tr|B3RJ64" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n = new PhylogenyNode(); seq = new org.forester.phylogeny.data.Sequence(); seq.setName( "K1PYK7_CRAGI" ); n.getNodeData().addSequence( seq ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK7_CRAGI" ) ) { return false; } seq.setName( "tr|B3RJ64" ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } n = new PhylogenyNode(); seq = new org.forester.phylogeny.data.Sequence(); seq.setAccession( new Accession( "K1PYK8_CRAGI", "?" ) ); n.getNodeData().addSequence( seq ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK8_CRAGI" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "K1PYK8_CRAGI" ) ) { return false; } n = new PhylogenyNode(); seq = new org.forester.phylogeny.data.Sequence(); seq.setAccession( new Accession( "tr|B3RJ64", "?" ) ); n.getNodeData().addSequence( seq ); - if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + if ( !SequenceAccessionTools.obtainUniProtAccessorFromDataFields( n ).equals( "B3RJ64" ) ) { return false; } // n = new PhylogenyNode(); n.setName( "ACP19736" ); - if ( !SequenceAccessionTools.extractGenbankAccessor( n ).equals( "ACP19736" ) ) { + if ( !SequenceAccessionTools.obtainGenbankAccessorFromDataFields( n ).equals( "ACP19736" ) ) { return false; } n = new PhylogenyNode(); n.setName( "_ACP19736_" ); - if ( !SequenceAccessionTools.extractGenbankAccessor( n ).equals( "ACP19736" ) ) { + if ( !SequenceAccessionTools.obtainGenbankAccessorFromDataFields( n ).equals( "ACP19736" ) ) { return false; } } @@ -9630,7 +9630,7 @@ public final class Test { private static boolean testSequenceIdParsing() { try { - Accession id = SequenceAccessionTools.parse( "gb_ADF31344_segmented_worms_" ); + Accession id = SequenceAccessionTools.parseAccessorFromString( "gb_ADF31344_segmented_worms_" ); if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { @@ -9640,7 +9640,7 @@ public final class Test { return false; } // - id = SequenceAccessionTools.parse( "segmented worms|gb_ADF31344" ); + id = SequenceAccessionTools.parseAccessorFromString( "segmented worms|gb_ADF31344" ); if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { @@ -9650,7 +9650,7 @@ public final class Test { return false; } // - id = SequenceAccessionTools.parse( "segmented worms gb_ADF31344 and more" ); + id = SequenceAccessionTools.parseAccessorFromString( "segmented worms gb_ADF31344 and more" ); if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { @@ -9660,7 +9660,7 @@ public final class Test { return false; } // - id = SequenceAccessionTools.parse( "gb_AAA96518_1" ); + id = SequenceAccessionTools.parseAccessorFromString( "gb_AAA96518_1" ); if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) || !id.getValue().equals( "AAA96518" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { @@ -9670,7 +9670,7 @@ public final class Test { return false; } // - id = SequenceAccessionTools.parse( "gb_EHB07727_1_rodents_" ); + id = SequenceAccessionTools.parseAccessorFromString( "gb_EHB07727_1_rodents_" ); if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) || !id.getValue().equals( "EHB07727" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { @@ -9680,7 +9680,7 @@ public final class Test { return false; } // - id = SequenceAccessionTools.parse( "dbj_BAF37827_1_turtles_" ); + id = SequenceAccessionTools.parseAccessorFromString( "dbj_BAF37827_1_turtles_" ); if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) || !id.getValue().equals( "BAF37827" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { @@ -9690,7 +9690,7 @@ public final class Test { return false; } // - id = SequenceAccessionTools.parse( "emb_CAA73223_1_primates_" ); + id = SequenceAccessionTools.parseAccessorFromString( "emb_CAA73223_1_primates_" ); if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) || !id.getValue().equals( "CAA73223" ) || !id.getSource().equals( "ncbi" ) ) { if ( id != null ) { @@ -9700,7 +9700,7 @@ public final class Test { return false; } // - id = SequenceAccessionTools.parse( "mites|ref_XP_002434188_1" ); + id = SequenceAccessionTools.parseAccessorFromString( "mites|ref_XP_002434188_1" ); if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) || !id.getValue().equals( "XP_002434188" ) || !id.getSource().equals( "refseq" ) ) { if ( id != null ) { @@ -9710,7 +9710,7 @@ public final class Test { return false; } // - id = SequenceAccessionTools.parse( "mites_ref_XP_002434188_1_bla_XP_12345" ); + id = SequenceAccessionTools.parseAccessorFromString( "mites_ref_XP_002434188_1_bla_XP_12345" ); if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) || !id.getValue().equals( "XP_002434188" ) || !id.getSource().equals( "refseq" ) ) { if ( id != null ) { @@ -9720,7 +9720,7 @@ public final class Test { return false; } // - id = SequenceAccessionTools.parse( "P4A123" ); + id = SequenceAccessionTools.parseAccessorFromString( "P4A123" ); if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() ) || !id.getValue().equals( "P4A123" ) || !id.getSource().equals( "uniprot" ) ) { if ( id != null ) { @@ -9740,7 +9740,7 @@ public final class Test { // return false; // } // - id = SequenceAccessionTools.parse( "XP_12345" ); + id = SequenceAccessionTools.parseAccessorFromString( "XP_12345" ); if ( id != null ) { System.out.println( "value =" + id.getValue() ); System.out.println( "provider=" + id.getSource() ); diff --git a/forester/java/src/org/forester/util/SequenceAccessionTools.java b/forester/java/src/org/forester/util/SequenceAccessionTools.java index 4136049..c65fbc5 100644 --- a/forester/java/src/org/forester/util/SequenceAccessionTools.java +++ b/forester/java/src/org/forester/util/SequenceAccessionTools.java @@ -37,12 +37,6 @@ import org.forester.phylogeny.data.Sequence; public final class SequenceAccessionTools { - public final static Pattern UNIPROT_KB_PATTERN_0 = Pattern - .compile( "\\b([A-Z][0-9][A-Z0-9]{3}[0-9])\\b" ); - public final static Pattern UNIPROT_KB_PATTERN_1 = Pattern - .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" ); - public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern - .compile( "(?:\\b|_)(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)(?:\\b|_)" ); // gb_ADF31344_1_segmented_worms_ // gb_AAA96518_1 // gb_EHB07727_1_rodents_ @@ -56,207 +50,194 @@ public final class SequenceAccessionTools { //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals //Protein: 3 letters + 5 numerals //http://www.ncbi.nlm.nih.gov/Sequin/acc.html - private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); - private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); - private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); - private final static Pattern GI_PATTERN = Pattern - .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" ); + public final static Pattern GENBANK_NUC_PATTERN_1 = Pattern + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); + public final static Pattern GENBANK_NUC_PATTERN_2 = Pattern + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); + public final static Pattern GENBANK_PROT_PATTERN = Pattern + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); + public final static Pattern GI_PATTERN = Pattern.compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" ); + public final static Pattern UNIPROT_KB_PATTERN_0 = Pattern.compile( "\\b([A-Z][0-9][A-Z0-9]{3}[0-9])\\b" ); + public final static Pattern UNIPROT_KB_PATTERN_1 = Pattern + .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" ); + public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern + .compile( "(?:\\b|_)(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)(?:\\b|_)" ); // RefSeq accession numbers can be distinguished from GenBank accessions // by their distinct prefix format of 2 characters followed by an // underscore character ('_'). For example, a RefSeq protein accession is NP_015325. - private final static Pattern REFSEQ_PATTERN = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" ); + private final static Pattern REFSEQ_PATTERN = Pattern + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" ); private SequenceAccessionTools() { // Hiding the constructor. } - public static String extractGenbankAccessor( final PhylogenyNode node ) { - String v = null; - if ( node.getNodeData().isHasSequence() ) { - final Sequence seq = node.getNodeData().getSequence(); - if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) { - v = parseGenbankAccessor( seq.getSymbol() ); - } - if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) { - v = parseGenbankAccessor( seq.getGeneName() ); - } - if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) { - v = parseGenbankAccessor( seq.getName() ); - } - if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) { - v = parseGenbankAccessor( seq.getAccession().getValue() ); - } + public final static boolean isProteinDbQuery( final String query ) { + final String r1 = parseRefSeqAccessorFromString( query ); + if ( !ForesterUtil.isEmpty( r1 ) && ( r1.charAt( 1 ) == 'P' ) ) { + return true; } - if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) { - v = parseGenbankAccessor( node.getName() ); + final String r2 = parseUniProtAccessorFromString( query ); + if ( !ForesterUtil.isEmpty( r2 ) ) { + return true; } - return v; + return GENBANK_PROT_PATTERN.matcher( query ).lookingAt(); } - public static String extractGInumber( final PhylogenyNode node ) { - String v = null; - if ( node.getNodeData().isHasSequence() ) { - final Sequence seq = node.getNodeData().getSequence(); - if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) { - v = parseGInumber( seq.getName() ); + public final static Accession obtainAccessorFromDataFields( final PhylogenyNode n ) { + String a = obtainUniProtAccessorFromDataFields( n ); + if ( !ForesterUtil.isEmpty( a ) ) { + return new Accession( a, Accession.UNIPROT ); + } + a = obtainGenbankAccessorFromDataFields( n ); + if ( !ForesterUtil.isEmpty( a ) ) { + return new Accession( a, Accession.NCBI ); + } + a = obtainRefSeqAccessorFromDataFields( n ); + if ( !ForesterUtil.isEmpty( a ) ) { + return new Accession( a, Accession.REFSEQ ); + } + a = obtainGiNumberFromDataFields( n ); + if ( !ForesterUtil.isEmpty( a ) ) { + return new Accession( a, Accession.GI ); + } + return null; + } + + public final static Accession obtainFromSeqAccession( final PhylogenyNode n ) { + if ( n.getNodeData().isHasSequence() && ( n.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getAccession().getSource() ) + && !ForesterUtil.isEmpty( n.getNodeData().getSequence().getAccession().getValue() ) ) { + final String source = n.getNodeData().getSequence().getAccession().getSource().toLowerCase(); + final String value = n.getNodeData().getSequence().getAccession().getValue(); + if ( ( source.startsWith( "uniprot" ) || source.equals( "swissprot" ) || source.equals( "trembl" ) || source + .equals( "sp" ) ) ) { + return new Accession( value, Accession.UNIPROT ); } - if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) { - v = parseGInumber( seq.getAccession().getValue() ); + else if ( source.equals( "embl" ) || source.equals( "ebi" ) ) { + return new Accession( value, Accession.EMBL ); + } + else if ( source.equals( "ncbi" ) || source.equals( "genbank" ) ) { + return new Accession( value, Accession.NCBI ); + } + else if ( source.equals( "refseq" ) ) { + return new Accession( value, Accession.REFSEQ ); + } + else if ( source.equals( "gi" ) ) { + return new Accession( value, Accession.GI ); } } - if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) { - v = parseGInumber( node.getName() ); - } - return v; + return null; } - public static String extractRefSeqAccessor( final PhylogenyNode node ) { - String v = null; - if ( node.getNodeData().isHasSequence() ) { - final Sequence seq = node.getNodeData().getSequence(); + public final static String obtainGenbankAccessorFromDataFields( final PhylogenyNode n ) { + String a = null; + if ( n.getNodeData().isHasSequence() ) { + final Sequence seq = n.getNodeData().getSequence(); if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) { - v = parseRefSeqAccessor( seq.getSymbol() ); + a = parseGenbankAccessorFromString( seq.getSymbol() ); } if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) { - v = parseRefSeqAccessor( seq.getGeneName() ); + a = parseGenbankAccessorFromString( seq.getGeneName() ); } - if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) { - v = parseRefSeqAccessor( seq.getName() ); + if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) { + a = parseGenbankAccessorFromString( seq.getName() ); } - if ( ForesterUtil.isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) + if ( ForesterUtil.isEmpty( a ) && ( n.getNodeData().getSequence().getAccession() != null ) && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) { - v = parseRefSeqAccessor( seq.getAccession().getValue() ); + a = parseGenbankAccessorFromString( seq.getAccession().getValue() ); } } - if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) { - v = parseRefSeqAccessor( node.getName() ); + if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( n.getName() ) ) { + a = parseGenbankAccessorFromString( n.getName() ); } - return v; + return a; } - public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) { + public final static String obtainGiNumberFromDataFields( final PhylogenyNode n ) { String a = null; - if ( node.getNodeData().isHasSequence() ) { - final Sequence seq = node.getNodeData().getSequence(); - if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) { - a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getSymbol() ); - } + if ( n.getNodeData().isHasSequence() ) { + final Sequence seq = n.getNodeData().getSequence(); if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) { - a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getName() ); + a = parseGInumberFromString( seq.getName() ); } if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getGeneName() ) ) { - a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getGeneName() ); + a = parseGInumberFromString( seq.getGeneName() ); } - if ( ForesterUtil.isEmpty( a ) && ( node.getNodeData().getSequence().getAccession() != null ) + if ( ForesterUtil.isEmpty( a ) && ( n.getNodeData().getSequence().getAccession() != null ) && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) { - a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( seq.getAccession().getValue() ); + a = parseGInumberFromString( seq.getAccession().getValue() ); } } - if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( node.getName() ) ) { - a = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node.getName() ); + if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( n.getName() ) ) { + a = parseGInumberFromString( n.getName() ); } return a; } - public static String extractUniProtKbProteinSeqIdentifier( final String str ) { - Matcher m = UNIPROT_KB_PATTERN_0.matcher( str ); - if ( m.find() ) { - return m.group( 1 ); - } - m = UNIPROT_KB_PATTERN_1.matcher( str ); - if ( m.find() ) { - return m.group( 1 ); - } - m = UNIPROT_KB_PATTERN_2.matcher( str ); - if ( m.find() ) { - return m.group(); - } - return null; - } - - public final static boolean isProtein( final String query ) { - final String r1 = parseRefSeqAccessor( query ); - if ( !ForesterUtil.isEmpty( r1 ) && ( r1.charAt( 1 ) == 'P' ) ) { - return true; - } - final String r2 = extractUniProtKbProteinSeqIdentifier( query ); - if ( !ForesterUtil.isEmpty( r2 ) ) { - return true; - } - return GENBANK_PROTEIN_AC_PATTERN.matcher( query ).lookingAt(); - } - - public final static Accession parse( final PhylogenyNode n ) { - String v = extractUniProtKbProteinSeqIdentifier( n ); - if ( !ForesterUtil.isEmpty( v ) ) { - return new Accession( v, Accession.UNIPROT ); - } - v = extractGenbankAccessor( n ); - if ( !ForesterUtil.isEmpty( v ) ) { - return new Accession( v, Accession.NCBI ); - } - v = extractRefSeqAccessor( n ); - if ( !ForesterUtil.isEmpty( v ) ) { - return new Accession( v, Accession.REFSEQ ); + public final static String obtainRefSeqAccessorFromDataFields( final PhylogenyNode n ) { + String a = null; + if ( n.getNodeData().isHasSequence() ) { + final Sequence seq = n.getNodeData().getSequence(); + if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) { + a = parseRefSeqAccessorFromString( seq.getSymbol() ); + } + if ( !ForesterUtil.isEmpty( seq.getGeneName() ) ) { + a = parseRefSeqAccessorFromString( seq.getGeneName() ); + } + if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) { + a = parseRefSeqAccessorFromString( seq.getName() ); + } + if ( ForesterUtil.isEmpty( a ) && ( n.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) { + a = parseRefSeqAccessorFromString( seq.getAccession().getValue() ); + } } - v = extractGInumber( n ); - if ( !ForesterUtil.isEmpty( v ) ) { - return new Accession( v, Accession.GI ); + if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( n.getName() ) ) { + a = parseRefSeqAccessorFromString( n.getName() ); } - return null; + return a; } - public final static Accession obtainFromSeqAccession( final PhylogenyNode node ) { - if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) ) { - final String source = node.getNodeData().getSequence().getAccession().getSource().toLowerCase(); - final String value = node.getNodeData().getSequence().getAccession().getValue(); - if ( ( source.startsWith( "uniprot" ) || source.equals( "swissprot" ) || source.equals( "trembl" ) || source - .equals( "sp" ) ) ) { - return new Accession( value, Accession.UNIPROT ); - } - else if ( source.equals( "embl" ) || source.equals( "ebi" ) ) { - return new Accession( value, Accession.EMBL ); + public final static String obtainUniProtAccessorFromDataFields( final PhylogenyNode n ) { + String a = null; + if ( n.getNodeData().isHasSequence() ) { + final Sequence seq = n.getNodeData().getSequence(); + if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) { + a = SequenceAccessionTools.parseUniProtAccessorFromString( seq.getSymbol() ); } - else if ( source.equals( "ncbi" ) || source.equals( "genbank" ) ) { - return new Accession( value, Accession.NCBI ); + if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getName() ) ) { + a = SequenceAccessionTools.parseUniProtAccessorFromString( seq.getName() ); } - else if ( source.equals( "refseq" ) ) { - return new Accession( value, Accession.REFSEQ ); + if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( seq.getGeneName() ) ) { + a = SequenceAccessionTools.parseUniProtAccessorFromString( seq.getGeneName() ); } - else if ( source.equals( "gi" ) ) { - return new Accession( value, Accession.GI ); + if ( ForesterUtil.isEmpty( a ) && ( n.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) { + a = SequenceAccessionTools.parseUniProtAccessorFromString( seq.getAccession().getValue() ); } } - return null; + if ( ForesterUtil.isEmpty( a ) && !ForesterUtil.isEmpty( n.getName() ) ) { + a = SequenceAccessionTools.parseUniProtAccessorFromString( n.getName() ); + } + return a; } - /** - * Returns null if no match. - * - */ - public final static Accession parse( final String s ) { + public final static Accession parseAccessorFromString( final String s ) { if ( !ForesterUtil.isEmpty( s ) ) { - String v = extractUniProtKbProteinSeqIdentifier( s ); + String v = parseUniProtAccessorFromString( s ); if ( !ForesterUtil.isEmpty( v ) ) { return new Accession( v, Accession.UNIPROT ); } - v = parseGenbankAccessor( s ); + v = parseGenbankAccessorFromString( s ); if ( !ForesterUtil.isEmpty( v ) ) { return new Accession( v, Accession.NCBI ); } - v = parseRefSeqAccessor( s ); + v = parseRefSeqAccessorFromString( s ); if ( !ForesterUtil.isEmpty( v ) ) { return new Accession( v, Accession.REFSEQ ); } - v = parseGInumber( s ); + v = parseGInumberFromString( s ); if ( !ForesterUtil.isEmpty( v ) ) { return new Accession( v, Accession.GI ); } @@ -264,22 +245,18 @@ public final class SequenceAccessionTools { return null; } - /** - * Returns null if no match. - * - */ - public static String parseGenbankAccessor( final String query ) { - Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query ); + public final static String parseGenbankAccessorFromString( final String s ) { + Matcher m = GENBANK_NUC_PATTERN_1.matcher( s ); if ( m.lookingAt() ) { return m.group( 1 ); } else { - m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query ); + m = GENBANK_NUC_PATTERN_2.matcher( s ); if ( m.lookingAt() ) { return m.group( 1 ); } else { - m = GENBANK_PROTEIN_AC_PATTERN.matcher( query ); + m = GENBANK_PROT_PATTERN.matcher( s ); if ( m.lookingAt() ) { return m.group( 1 ); } @@ -290,8 +267,8 @@ public final class SequenceAccessionTools { } } - public static String parseGenbankProteinAccessor( final String query ) { - final Matcher m = GENBANK_PROTEIN_AC_PATTERN.matcher( query ); + public final static String parseGenbankProteinAccessorFromString( final String s ) { + final Matcher m = GENBANK_PROT_PATTERN.matcher( s ); if ( m.lookingAt() ) { return m.group( 1 ); } @@ -300,23 +277,35 @@ public final class SequenceAccessionTools { } } - public static String parseGInumber( final String query ) { - final Matcher m = GI_PATTERN.matcher( query ); + public final static String parseGInumberFromString( final String s ) { + final Matcher m = GI_PATTERN.matcher( s ); if ( m.find() ) { return m.group( 1 ); } return null; } - /** - * Returns null if no match. - * - */ - public final static String parseRefSeqAccessor( final String query ) { - final Matcher m = REFSEQ_PATTERN.matcher( query ); + public final static String parseRefSeqAccessorFromString( final String s ) { + final Matcher m = REFSEQ_PATTERN.matcher( s ); if ( m.lookingAt() ) { return m.group( 1 ); } return null; } + + public final static String parseUniProtAccessorFromString( final String s ) { + Matcher m = UNIPROT_KB_PATTERN_0.matcher( s ); + if ( m.find() ) { + return m.group( 1 ); + } + m = UNIPROT_KB_PATTERN_1.matcher( s ); + if ( m.find() ) { + return m.group( 1 ); + } + m = UNIPROT_KB_PATTERN_2.matcher( s ); + if ( m.find() ) { + return m.group(); + } + return null; + } } diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java index c40b37e..17c56d1 100644 --- a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java +++ b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java @@ -26,6 +26,7 @@ package org.forester.ws.seqdb; import java.io.BufferedReader; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; @@ -58,7 +59,7 @@ public final class SequenceDbWsTools { public final static String EMBL_DBS_REFSEQ_P = "refseqp"; public final static String EMBL_DBS_REFSEQ_N = "refseqn"; private final static String URL_ENC = "UTF-8"; - private final static boolean DEBUG = false; + private final static boolean DEBUG = true; private static List getTaxonomiesFromCommonName( final String cn, final int max_taxonomies_return ) throws IOException { @@ -159,96 +160,13 @@ public final class SequenceDbWsTools { if ( ext_nodes_only && node.isInternal() ) { continue; } - // String query = null; - // Accession id = null; - // Accession acc = SequenceAccessionTools.obtain( node ); - // - // - // Db db = Db.NONE; - // if ( node.getNodeData().isHasSequence() - // && ( node.getNodeData().getSequence().getAccession() != null ) - // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - // && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase() - // .startsWith( "uniprot" ) - // || node.getNodeData().getSequence().getAccession().getValue() - // .equalsIgnoreCase( "swissprot" ) - // || node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "trembl" ) || node - // .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "sp" ) ) ) { - // query = node.getNodeData().getSequence().getAccession().getValue(); - // db = Db.UNIPROT; - // } - // else if ( node.getNodeData().isHasSequence() - // && ( node.getNodeData().getSequence().getAccession() != null ) - // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - // && ( node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "embl" ) || node - // .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "ebi" ) ) ) { - // query = node.getNodeData().getSequence().getAccession().getValue(); - // db = Db.EMBL; - // } - // else if ( node.getNodeData().isHasSequence() - // && ( node.getNodeData().getSequence().getAccession() != null ) - // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - // && ( node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "ncbi" ) || node - // .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "genbank" ) ) ) { - // query = node.getNodeData().getSequence().getAccession().getValue(); - // // db = Db.NCBI; - // } - // else if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) - // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - // && node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "refseq" ) ) { - // query = node.getNodeData().getSequence().getAccession().getValue(); - // db = Db.REFSEQ; - // } - // else { Accession acc = SequenceAccessionTools.obtainFromSeqAccession( node ); - // if ( ( query = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node ) ) != null ) { - // db = Db.UNIPROT; - // } - // else if ( node.getNodeData().isHasSequence() ) { - // if ( ( id = SequenceAccessionTools.parse( node.getName() ) ) != null ) { - // if ( id.getSource() == Accession.NCBI ) { - // // db = Db.NCBI; - // } - // else if ( id.getSource() == Accession.REFSEQ ) { - // db = Db.REFSEQ; - // } - // } - // else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getName() ) ) != null ) { - // if ( id.getSource() == Accession.NCBI ) { - // // = Db.NCBI; - // } - // else if ( id.getSource() == Accession.REFSEQ ) { - // db = Db.REFSEQ; - // } - // } - // else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getGeneName() ) ) != null ) { - // if ( id.getSource() == Accession.NCBI ) { - // // db = Db.NCBI; - // } - // else if ( id.getSource() == Accession.REFSEQ ) { - // db = Db.REFSEQ; - // } - // } - // else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getSymbol() ) ) != null ) { - // if ( id.getSource() == Accession.NCBI ) { - // // db = Db.NCBI; - // } - // else if ( id.getSource() == Accession.REFSEQ ) { - // db = Db.REFSEQ; - // } - // } - // } - // } if ( ( acc == null ) || ForesterUtil.isEmpty( acc.getSource() ) || ForesterUtil.isEmpty( acc.getValue() ) || ( ( acc.getSource() != Accession.UNIPROT ) && ( acc.getSource() != Accession.EMBL ) && ( acc .getSource() != Accession.REFSEQ ) ) ) { - acc = SequenceAccessionTools.parse( node ); + acc = SequenceAccessionTools.obtainAccessorFromDataFields( node ); } if ( ( acc == null ) || ForesterUtil.isEmpty( acc.getSource() ) @@ -264,19 +182,34 @@ public final class SequenceDbWsTools { if ( DEBUG ) { System.out.println( "uniprot: " + query ); } - db_entry = obtainUniProtEntry( query, lines_to_return ); + try { + db_entry = obtainUniProtEntry( query, lines_to_return ); + } + catch ( FileNotFoundException e ) { + // Eat this, and move to next. + } } else if ( acc.getSource() == Accession.EMBL ) { if ( DEBUG ) { System.out.println( "embl: " + query ); } - db_entry = obtainEmblEntry( new Accession( query ), lines_to_return ); + try { + db_entry = obtainEmblEntry( new Accession( query ), lines_to_return ); + } + catch ( FileNotFoundException e ) { + // Eat this, and move to next. + } } else if ( acc.getSource() == Accession.REFSEQ ) { if ( DEBUG ) { System.out.println( "refseq: " + query ); } - db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return ); + try { + db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return ); + } + catch ( FileNotFoundException e ) { + // Eat this, and move to next. + } } if ( ( db_entry != null ) && !db_entry.isEmpty() ) { final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() @@ -322,6 +255,7 @@ public final class SequenceDbWsTools { node.getNodeData().setSequence( seq ); } else { + node.i not_found.add( node.getName() ); } try { diff --git a/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java b/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java index 2eaa720..4a8d158 100644 --- a/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java +++ b/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java @@ -132,7 +132,6 @@ public final class UniProtEntry implements SequenceDatabaseEntry { if ( _go_terms == null ) { _go_terms = new ArrayList(); } - System.out.println( "GOTERM ADDED: " + g ); _go_terms.add( g ); } @@ -209,7 +208,6 @@ public final class UniProtEntry implements SequenceDatabaseEntry { else if ( ns_str.equals( "C" ) ) { gns = GoNameSpace.CELLULAR_COMPONENT_STR; } - System.out.println( "GO:" + id + " " + desc + " " + ns_str ); e.addGoTerm( new BasicGoTerm( id, desc, gns, false ) ); } }