From 9cb3f8c0c0ddf4045d97c04784ceb419397e8b06 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Wed, 6 Mar 2013 04:29:05 +0000 Subject: [PATCH] inprogress --- .../src/org/forester/archaeopteryx/AptxUtil.java | 86 ++++++- .../src/org/forester/archaeopteryx/TreePanel.java | 149 ++----------- forester/java/src/org/forester/test/Test.java | 235 ++++++++++++++++++++ .../java/src/org/forester/util/ForesterUtil.java | 108 +++++++++ .../src/org/forester/util/SequenceIdParser.java | 2 +- 5 files changed, 448 insertions(+), 132 deletions(-) diff --git a/forester/java/src/org/forester/archaeopteryx/AptxUtil.java b/forester/java/src/org/forester/archaeopteryx/AptxUtil.java index 18e80e7..823e18f 100644 --- a/forester/java/src/org/forester/archaeopteryx/AptxUtil.java +++ b/forester/java/src/org/forester/archaeopteryx/AptxUtil.java @@ -36,10 +36,12 @@ import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.net.URI; import java.net.URL; +import java.net.URLEncoder; import java.text.ParseException; import java.util.Arrays; import java.util.HashMap; @@ -76,6 +78,7 @@ import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.data.Accession; import org.forester.phylogeny.data.BranchColor; +import org.forester.phylogeny.data.Sequence; import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory; import org.forester.phylogeny.factories.PhylogenyFactory; @@ -83,17 +86,13 @@ import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.phylogeny.iterators.PreorderTreeIterator; import org.forester.util.AsciiHistogram; import org.forester.util.DescriptiveStatistics; +import org.forester.util.ForesterConstants; import org.forester.util.ForesterUtil; +import org.forester.util.SequenceIdParser; import org.forester.ws.seqdb.UniProtTaxonomy; public final class AptxUtil { - final static String UNIPROT_KB = "http://www.uniprot.org/uniprot/"; - final static Pattern UNIPROT_KB_PATTERN_1 = Pattern - .compile( "\\b(sp|tr)\\W([A-Z0-9]{5,6})\\b" ); - - final static Pattern UNIPROT_KB_PATTERN_2 = Pattern - .compile( "\\b[A-Z0-9]{5,6}_[A-Z9][A-Z]{2}[A-Z0-9]{2}|RAT|PIG|PEA\\b" ); private final static Pattern seq_identifier_pattern_1 = Pattern .compile( "^([A-Za-z]{2,5})[|=:]([0-9A-Za-z_\\.]{5,40})\\s*$" ); private final static Pattern seq_identifier_pattern_2 = Pattern @@ -104,6 +103,81 @@ public final class AptxUtil { Arrays.sort( AVAILABLE_FONT_FAMILIES_SORTED ); } + public final static String createUriForSeqWeb( final PhylogenyNode node, + final Configuration conf, + final TreePanel tp ) { + String uri_str = null; + if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) + && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) + && conf.isHasWebLink( node.getNodeData().getSequence().getAccession().getSource().toLowerCase() ) ) { + final Sequence seq = node.getNodeData().getSequence(); + final String source = seq.getAccession().getSource().toLowerCase(); + String url; + if ( source.toLowerCase().equals( "ncbi" ) ) { + url = Constants.NCBI_ALL_DATABASE_SEARCH; + } + else { + final WebLink weblink = conf.getWebLink( source ); + url = weblink.getUrl().toString(); + } + try { + uri_str = url + URLEncoder.encode( seq.getAccession().getValue(), ForesterConstants.UTF8 ); + } + catch ( final UnsupportedEncodingException e ) { + showErrorMessage( tp, e.toString() ); + e.printStackTrace(); + } + } + if ( ForesterUtil.isEmpty( uri_str ) ) { + final String upkb = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node ); + if ( !ForesterUtil.isEmpty( upkb ) ) { + try { + uri_str = ForesterUtil.UNIPROT_KB + URLEncoder.encode( upkb, ForesterConstants.UTF8 ); + } + catch ( final UnsupportedEncodingException e ) { + showErrorMessage( tp, e.toString() ); + e.printStackTrace(); + } + } + } + if ( ForesterUtil.isEmpty( uri_str ) ) { + final String v = ForesterUtil.extractGenbankAccessor( node ); + if ( !ForesterUtil.isEmpty( v ) ) { + try { + if ( SequenceIdParser.isProtein( v ) ) { + uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 ); + } + else { + uri_str = ForesterUtil.NCBI_NUCCORE + URLEncoder.encode( v, ForesterConstants.UTF8 ); + } + } + catch ( final UnsupportedEncodingException e ) { + showErrorMessage( tp, e.toString() ); + e.printStackTrace(); + } + } + } + if ( ForesterUtil.isEmpty( uri_str ) ) { + final String v = ForesterUtil.extractRefSeqAccessorAccessor( node ); + if ( !ForesterUtil.isEmpty( v ) ) { + try { + if ( SequenceIdParser.isProtein( v ) ) { + uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 ); + } + else { + uri_str = ForesterUtil.NCBI_NUCCORE + URLEncoder.encode( v, ForesterConstants.UTF8 ); + } + } + catch ( final UnsupportedEncodingException e ) { + showErrorMessage( tp, e.toString() ); + e.printStackTrace(); + } + } + } + return uri_str; + } + public static MaskFormatter createMaskFormatter( final String s ) { MaskFormatter formatter = null; try { diff --git a/forester/java/src/org/forester/archaeopteryx/TreePanel.java b/forester/java/src/org/forester/archaeopteryx/TreePanel.java index b94080f..07a99bd 100644 --- a/forester/java/src/org/forester/archaeopteryx/TreePanel.java +++ b/forester/java/src/org/forester/archaeopteryx/TreePanel.java @@ -77,7 +77,6 @@ import java.util.Hashtable; import java.util.List; import java.util.Set; import java.util.SortedSet; -import java.util.regex.Matcher; import javax.swing.BorderFactory; import javax.swing.JApplet; @@ -3142,7 +3141,14 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee final String title = clickto_names.get( i ); _node_popup_menu_items[ i ] = new JMenuItem( title ); if ( title.equals( Configuration.clickto_options[ Configuration.open_seq_web ][ 0 ] ) ) { - _node_popup_menu_items[ i ].setEnabled( isCanOpenSeqWeb( node ) ); + final String id = isCanOpenSeqWeb( node ); + if ( !ForesterUtil.isEmpty( id ) ) { + _node_popup_menu_items[ i ].setText( _node_popup_menu_items[ i ].getText() + " [" + id + "]" ); + _node_popup_menu_items[ i ].setEnabled( true ); + } + else { + _node_popup_menu_items[ i ].setEnabled( false ); + } } else if ( title.equals( Configuration.clickto_options[ Configuration.open_tax_web ][ 0 ] ) ) { _node_popup_menu_items[ i ].setEnabled( isCanOpenTaxWeb( node ) ); @@ -3233,137 +3239,31 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee } } - final private boolean isCanOpenSeqWeb( final PhylogenyNode node ) { + final private String isCanOpenSeqWeb( final PhylogenyNode node ) { if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) && getConfiguration().isHasWebLink( node.getNodeData().getSequence().getAccession().getSource() .toLowerCase() ) ) { - return true; + return node.getNodeData().getSequence().getAccession().getSource(); } - if ( !ForesterUtil.isEmpty( node.getName() ) - && ( AptxUtil.UNIPROT_KB_PATTERN_1.matcher( node.getName() ).find() || AptxUtil.UNIPROT_KB_PATTERN_2 - .matcher( node.getName() ).find() ) ) { - return true; + String v = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node ); + if ( ForesterUtil.isEmpty( v ) ) { + v = ForesterUtil.extractGenbankAccessor( node ); } - if ( node.getNodeData().isHasSequence() ) { - Sequence seq = node.getNodeData().getSequence(); - if ( !ForesterUtil.isEmpty( seq.getName() ) - && ( AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getName() ).find() || AptxUtil.UNIPROT_KB_PATTERN_2 - .matcher( seq.getName() ).find() ) ) { - return true; - } - if ( !ForesterUtil.isEmpty( seq.getSymbol() ) - && ( AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() ).find() || AptxUtil.UNIPROT_KB_PATTERN_2 - .matcher( seq.getSymbol() ).find() ) ) { - return true; - } - if ( ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) - && ( AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() ).find() || AptxUtil.UNIPROT_KB_PATTERN_2 - .matcher( seq.getAccession().getValue() ).find() ) ) { - return true; - } + if ( ForesterUtil.isEmpty( v ) ) { + v = ForesterUtil.extractRefSeqAccessorAccessor( node ); } - return false; + return v; } final private void openSeqWeb( final PhylogenyNode node ) { - if ( !isCanOpenSeqWeb( node ) ) { + if ( ForesterUtil.isEmpty( isCanOpenSeqWeb( node ) ) ) { cannotOpenBrowserWarningMessage( "sequence" ); return; } - String uri_str = null; - if ( node.getNodeData().isHasSequence() - && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) - && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) - && getConfiguration().isHasWebLink( node.getNodeData().getSequence().getAccession().getSource() - .toLowerCase() ) ) { - final Sequence seq = node.getNodeData().getSequence(); - final String source = seq.getAccession().getSource().toLowerCase(); - String url; - if ( source.toLowerCase().equals( "ncbi" ) ) { - url = Constants.NCBI_ALL_DATABASE_SEARCH; - } - else { - final WebLink weblink = getConfiguration().getWebLink( source ); - url = weblink.getUrl().toString(); - } - try { - uri_str = url + URLEncoder.encode( seq.getAccession().getValue(), ForesterConstants.UTF8 ); - } - catch ( final UnsupportedEncodingException e ) { - AptxUtil.showErrorMessage( this, e.toString() ); - e.printStackTrace(); - } - } - else { - String upkb = null; - if ( node.getNodeData().isHasSequence() ) { - Sequence seq = node.getNodeData().getSequence(); - Matcher m; - if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) { - m = AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() ); - if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) { - if ( m.find() ) { - upkb = m.group( 2 ); - } - else { - m = AptxUtil.UNIPROT_KB_PATTERN_2.matcher( seq.getSymbol() ); - if ( m.find() ) { - upkb = m.group(); - } - } - } - } - if ( ForesterUtil.isEmpty( upkb ) && !ForesterUtil.isEmpty( seq.getName() ) ) { - m = AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getName() ); - if ( m.find() ) { - upkb = m.group( 2 ); - } - else { - m = AptxUtil.UNIPROT_KB_PATTERN_2.matcher( seq.getName() ); - if ( m.find() ) { - upkb = m.group(); - } - } - } - if ( ForesterUtil.isEmpty( upkb ) && ( node.getNodeData().getSequence().getAccession() != null ) - && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) { - m = AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() ); - if ( m.find() ) { - upkb = m.group( 2 ); - } - else { - m = AptxUtil.UNIPROT_KB_PATTERN_2.matcher( seq.getAccession().getValue() ); - if ( m.find() ) { - upkb = m.group(); - } - } - } - } - if ( ForesterUtil.isEmpty( upkb ) && !ForesterUtil.isEmpty( node.getName() ) ) { - final Matcher m1 = AptxUtil.UNIPROT_KB_PATTERN_1.matcher( node.getName() ); - if ( m1.find() ) { - upkb = m1.group( 2 ); - } - else { - final Matcher m2 = AptxUtil.UNIPROT_KB_PATTERN_2.matcher( node.getName() ); - if ( m2.find() ) { - upkb = m2.group(); - } - } - } - try { - uri_str = AptxUtil.UNIPROT_KB + URLEncoder.encode( upkb, ForesterConstants.UTF8 ); - } - catch ( final UnsupportedEncodingException e ) { - AptxUtil.showErrorMessage( this, e.toString() ); - e.printStackTrace(); - } - } + final String uri_str = AptxUtil.createUriForSeqWeb( node, getConfiguration(), this ); if ( !ForesterUtil.isEmpty( uri_str ) ) { try { AptxUtil.launchWebBrowser( new URI( uri_str ), @@ -3417,7 +3317,7 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee } else if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) { try { - uri_str = "http://www.eol.org/search?q=" + uri_str = "http://www.uniprot.org/taxonomy/?query=" + URLEncoder.encode( tax.getScientificName(), ForesterConstants.UTF8 ); } catch ( final UnsupportedEncodingException e ) { @@ -3437,7 +3337,7 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee } else if ( !ForesterUtil.isEmpty( tax.getCommonName() ) ) { try { - uri_str = "http://www.eol.org/search?q=" + uri_str = "http://www.uniprot.org/taxonomy/?query=" + URLEncoder.encode( tax.getCommonName(), ForesterConstants.UTF8 ); } catch ( final UnsupportedEncodingException e ) { @@ -3447,11 +3347,10 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee } if ( !ForesterUtil.isEmpty( uri_str ) ) { try { - JApplet applet = null; - if ( isApplet() ) { - applet = obtainApplet(); - } - AptxUtil.launchWebBrowser( new URI( uri_str ), isApplet(), applet, "_aptx_tax" ); + AptxUtil.launchWebBrowser( new URI( uri_str ), + isApplet(), + isApplet() ? obtainApplet() : null, + "_aptx_tax" ); } catch ( final IOException e ) { AptxUtil.showErrorMessage( this, e.toString() ); diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index bd8b586..c13523c 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -38,6 +38,7 @@ import java.util.Locale; import java.util.Set; import org.forester.application.support_transfer; +import org.forester.archaeopteryx.AptxUtil; import org.forester.development.DevelopmentTools; import org.forester.evoinference.TestPhylogenyReconstruction; import org.forester.evoinference.matrix.character.CharacterStateMatrix; @@ -67,6 +68,7 @@ import org.forester.phylogeny.PhylogenyBranch; import org.forester.phylogeny.PhylogenyMethods; import org.forester.phylogeny.PhylogenyNode; import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE; +import org.forester.phylogeny.data.Accession; import org.forester.phylogeny.data.BinaryCharacters; import org.forester.phylogeny.data.BranchWidth; import org.forester.phylogeny.data.Confidence; @@ -216,6 +218,24 @@ public final class Test { System.out.println( "failed." ); failed++; } + System.out.print( "UniProtKB id extraction: " ); + if ( Test.testExtractUniProtKbProteinSeqIdentifier() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } + System.out.print( "Uri for Aptx web sequence accession: " ); + if ( Test.testCreateUriForSeqWeb() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } System.out.print( "Basic node construction and parsing of NHX (node level): " ); if ( Test.testNHXNodeParsing() ) { System.out.println( "OK." ); @@ -824,6 +844,221 @@ public final class Test { } } + private static boolean testExtractUniProtKbProteinSeqIdentifier() { + try { + PhylogenyNode n = new PhylogenyNode(); + n.setName( "tr|B3RJ64" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n.setName( "tr.B3RJ64" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n.setName( "tr=B3RJ64" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n.setName( "tr-B3RJ64" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n.setName( "tr/B3RJ64" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n.setName( "tr\\B3RJ64" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n.setName( "tr_B3RJ64" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n.setName( " tr|B3RJ64 " ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n.setName( "-tr|B3RJ64-" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n.setName( "-tr=B3RJ64-" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n.setName( "_tr=B3RJ64_" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n.setName( " tr_tr|B3RJ64_sp|123 " ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n.setName( "sp|B3RJ64" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n.setName( "ssp|B3RJ64" ); + if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + return false; + } + n.setName( "sp|B3RJ64C" ); + if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + return false; + } + n.setName( "sp B3RJ64" ); + if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + return false; + } + n.setName( "sp|B3RJ6X" ); + if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + return false; + } + n.setName( "sp|B3RJ6" ); + if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + return false; + } + n.setName( "K1PYK7_CRAGI" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { + return false; + } + n.setName( "K1PYK7_PEA" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PEA" ) ) { + return false; + } + n.setName( "K1PYK7_RAT" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_RAT" ) ) { + return false; + } + n.setName( "K1PYK7_PIG" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) { + return false; + } + n.setName( "~K1PYK7_PIG~" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) { + return false; + } + n.setName( "123456_ECOLI-K1PYK7_CRAGI-sp" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { + return false; + } + n.setName( "K1PYKX_CRAGI" ); + if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + return false; + } + n.setName( "XXXXX_CRAGI" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "XXXXX_CRAGI" ) ) { + return false; + } + n.setName( "tr|H3IB65|H3IB65_STRPU~2-2" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "H3IB65" ) ) { + return false; + } + n.setName( "jgi|Lacbi2|181470|Lacbi1.estExt_GeneWisePlus_human.C_10729~2-3" ); + if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) { + return false; + } + n.setName( "sp|Q86U06|RBM23_HUMAN~2-2" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "Q86U06" ) ) { + return false; + } + n = new PhylogenyNode(); + org.forester.phylogeny.data.Sequence seq = new org.forester.phylogeny.data.Sequence(); + seq.setSymbol( "K1PYK7_CRAGI" ); + n.getNodeData().addSequence( seq ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { + return false; + } + seq.setSymbol( "tr|B3RJ64" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n = new PhylogenyNode(); + seq = new org.forester.phylogeny.data.Sequence(); + seq.setName( "K1PYK7_CRAGI" ); + n.getNodeData().addSequence( seq ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) { + return false; + } + seq.setName( "tr|B3RJ64" ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + n = new PhylogenyNode(); + seq = new org.forester.phylogeny.data.Sequence(); + seq.setAccession( new Accession( "K1PYK8_CRAGI", "?" ) ); + n.getNodeData().addSequence( seq ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK8_CRAGI" ) ) { + return false; + } + n = new PhylogenyNode(); + seq = new org.forester.phylogeny.data.Sequence(); + seq.setAccession( new Accession( "tr|B3RJ64", "?" ) ); + n.getNodeData().addSequence( seq ); + if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) { + return false; + } + // + n = new PhylogenyNode(); + n.setName( "ACP19736" ); + if ( !ForesterUtil.extractGenbankAccessor( n ).equals( "ACP19736" ) ) { + return false; + } + n = new PhylogenyNode(); + n.setName( "_ACP19736_" ); + if ( !ForesterUtil.extractGenbankAccessor( n ).equals( "ACP19736" ) ) { + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + + private static boolean testCreateUriForSeqWeb() { + try { + final PhylogenyNode n = new PhylogenyNode(); + n.setName( "tr|B3RJ64" ); + if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.UNIPROT_KB + "B3RJ64" ) ) { + System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) ); + System.exit( -1 ); + return false; + } + n.setName( "B0LM41_HUMAN" ); + if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.UNIPROT_KB + "B0LM41_HUMAN" ) ) { + System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) ); + System.exit( -1 ); + return false; + } + n.setName( "NP_001025424" ); + if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "NP_001025424" ) ) { + System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) ); + System.exit( -1 ); + return false; + } + n.setName( "_NM_001030253-" ); + if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_NUCCORE + "NM_001030253" ) ) { + System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) ); + System.exit( -1 ); + return false; + } + n.setName( "NP_001025424" ); + if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "NP_001025424" ) ) { + System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) ); + System.exit( -1 ); + return false; + } + } + catch ( final Exception e ) { + e.printStackTrace( System.out ); + return false; + } + return true; + } + private static boolean testExtractTaxonomyCodeFromNodeName() { try { if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "MOUSE", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED ) diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java index 03c2e69..828d3df 100644 --- a/forester/java/src/org/forester/util/ForesterUtil.java +++ b/forester/java/src/org/forester/util/ForesterUtil.java @@ -59,6 +59,7 @@ import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; +import java.util.regex.Matcher; import java.util.regex.Pattern; import org.forester.phylogeny.PhylogenyNode; @@ -82,6 +83,13 @@ public final class ForesterUtil { public static final NumberFormat FORMATTER_6; public static final NumberFormat FORMATTER_06; public static final NumberFormat FORMATTER_3; + public static final String NCBI_PROTEIN = "http://www.ncbi.nlm.nih.gov/protein/"; + public static final String NCBI_NUCCORE = "http://www.ncbi.nlm.nih.gov/nuccore/"; + public final static String UNIPROT_KB = "http://www.uniprot.org/uniprot/"; + public final static Pattern UNIPROT_KB_PATTERN_1 = Pattern + .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" ); + public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern + .compile( "\\b(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)\\b" ); static { final DecimalFormatSymbols dfs = new DecimalFormatSymbols(); dfs.setDecimalSeparator( '.' ); @@ -95,6 +103,106 @@ public final class ForesterUtil { private ForesterUtil() { } + public static String extractRefSeqAccessorAccessor( final PhylogenyNode node ) { + String v = null; + if ( node.getNodeData().isHasSequence() ) { + final Sequence seq = node.getNodeData().getSequence(); + if ( !isEmpty( seq.getSymbol() ) ) { + v = SequenceIdParser.parseRefSeqAccessor( seq.getSymbol() ); + } + if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) { + v = SequenceIdParser.parseRefSeqAccessor( seq.getName() ); + } + if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) + && !isEmpty( seq.getAccession().getValue() ) ) { + v = SequenceIdParser.parseRefSeqAccessor( seq.getAccession().getValue() ); + } + } + if ( isEmpty( v ) && !isEmpty( node.getName() ) ) { + v = SequenceIdParser.parseRefSeqAccessor( node.getName() ); + } + return v; + } + + public static String extractGenbankAccessor( final PhylogenyNode node ) { + String v = null; + if ( node.getNodeData().isHasSequence() ) { + final Sequence seq = node.getNodeData().getSequence(); + if ( !isEmpty( seq.getSymbol() ) ) { + v = SequenceIdParser.parseGenbankAccessor( seq.getSymbol() ); + } + if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) { + v = SequenceIdParser.parseGenbankAccessor( seq.getName() ); + } + if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) + && !isEmpty( seq.getAccession().getValue() ) ) { + v = SequenceIdParser.parseGenbankAccessor( seq.getAccession().getValue() ); + } + } + if ( isEmpty( v ) && !isEmpty( node.getName() ) ) { + v = SequenceIdParser.parseGenbankAccessor( node.getName() ); + } + return v; + } + + public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) { + String upkb = null; + if ( node.getNodeData().isHasSequence() ) { + final Sequence seq = node.getNodeData().getSequence(); + Matcher m; + if ( !isEmpty( seq.getSymbol() ) ) { + m = UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() ); + if ( m.find() ) { + upkb = m.group( 1 ); + } + else { + m = UNIPROT_KB_PATTERN_2.matcher( seq.getSymbol() ); + if ( m.find() ) { + upkb = m.group(); + } + } + } + if ( isEmpty( upkb ) && !isEmpty( seq.getName() ) ) { + m = UNIPROT_KB_PATTERN_1.matcher( seq.getName() ); + if ( m.find() ) { + upkb = m.group( 1 ); + } + else { + m = UNIPROT_KB_PATTERN_2.matcher( seq.getName() ); + if ( m.find() ) { + upkb = m.group(); + } + } + } + if ( isEmpty( upkb ) && ( node.getNodeData().getSequence().getAccession() != null ) + && !isEmpty( seq.getAccession().getValue() ) ) { + m = UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() ); + if ( m.find() ) { + upkb = m.group( 1 ); + } + else { + m = UNIPROT_KB_PATTERN_2.matcher( seq.getAccession().getValue() ); + if ( m.find() ) { + upkb = m.group(); + } + } + } + } + if ( isEmpty( upkb ) && !isEmpty( node.getName() ) ) { + final Matcher m1 = UNIPROT_KB_PATTERN_1.matcher( node.getName() ); + if ( m1.find() ) { + upkb = m1.group( 1 ); + } + else { + final Matcher m2 = UNIPROT_KB_PATTERN_2.matcher( node.getName() ); + if ( m2.find() ) { + upkb = m2.group(); + } + } + } + return upkb; + } + final public static void appendSeparatorIfNotEmpty( final StringBuffer sb, final char separator ) { if ( sb.length() > 0 ) { sb.append( separator ); diff --git a/forester/java/src/org/forester/util/SequenceIdParser.java b/forester/java/src/org/forester/util/SequenceIdParser.java index c96d0f4..6d2dd37 100644 --- a/forester/java/src/org/forester/util/SequenceIdParser.java +++ b/forester/java/src/org/forester/util/SequenceIdParser.java @@ -125,7 +125,7 @@ public final class SequenceIdParser { * Returns null if no match. * */ - private final static String parseRefSeqAccessor( final String query ) { + public final static String parseRefSeqAccessor( final String query ) { final Matcher m = REFSEQ_PATTERN.matcher( query ); if ( m.lookingAt() ) { return m.group( 1 ); -- 1.7.10.2