import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.UnsupportedEncodingException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URI;
import java.net.URL;
+import java.net.URLEncoder;
import java.text.ParseException;
import java.util.Arrays;
import java.util.HashMap;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.data.Accession;
import org.forester.phylogeny.data.BranchColor;
+import org.forester.phylogeny.data.Sequence;
import org.forester.phylogeny.data.Taxonomy;
import org.forester.phylogeny.factories.ParserBasedPhylogenyFactory;
import org.forester.phylogeny.factories.PhylogenyFactory;
import org.forester.phylogeny.iterators.PreorderTreeIterator;
import org.forester.util.AsciiHistogram;
import org.forester.util.DescriptiveStatistics;
+import org.forester.util.ForesterConstants;
import org.forester.util.ForesterUtil;
+import org.forester.util.SequenceIdParser;
import org.forester.ws.seqdb.UniProtTaxonomy;
public final class AptxUtil {
- final static String UNIPROT_KB = "http://www.uniprot.org/uniprot/";
- final static Pattern UNIPROT_KB_PATTERN_1 = Pattern
- .compile( "\\b(sp|tr)\\W([A-Z0-9]{5,6})\\b" );
-
- final static Pattern UNIPROT_KB_PATTERN_2 = Pattern
- .compile( "\\b[A-Z0-9]{5,6}_[A-Z9][A-Z]{2}[A-Z0-9]{2}|RAT|PIG|PEA\\b" );
private final static Pattern seq_identifier_pattern_1 = Pattern
.compile( "^([A-Za-z]{2,5})[|=:]([0-9A-Za-z_\\.]{5,40})\\s*$" );
private final static Pattern seq_identifier_pattern_2 = Pattern
Arrays.sort( AVAILABLE_FONT_FAMILIES_SORTED );
}
+ public final static String createUriForSeqWeb( final PhylogenyNode node, // Builds the web URI for the sequence attached to (or named by) node; returns null when no identifier is found
+ final Configuration conf, // only consulted when the node has a sequence accession with non-empty source and value
+ final TreePanel tp ) { // handed to showErrorMessage if URL encoding fails
+ String uri_str = null;
+ if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) // 1st choice: explicit accession whose source has a web link in conf
+ && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
+ && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
+ && conf.isHasWebLink( node.getNodeData().getSequence().getAccession().getSource().toLowerCase() ) ) {
+ final Sequence seq = node.getNodeData().getSequence();
+ final String source = seq.getAccession().getSource().toLowerCase();
+ String url;
+ if ( source.toLowerCase().equals( "ncbi" ) ) { // source is already lower-cased above; "ncbi" uses the fixed search URL instead of the configured link
+ url = Constants.NCBI_ALL_DATABASE_SEARCH;
+ }
+ else {
+ final WebLink weblink = conf.getWebLink( source );
+ url = weblink.getUrl().toString();
+ }
+ try {
+ uri_str = url + URLEncoder.encode( seq.getAccession().getValue(), ForesterConstants.UTF8 ); // base URL + URL-encoded accession value
+ }
+ catch ( final UnsupportedEncodingException e ) { // report the failure; uri_str stays null so the fallbacks below still run
+ showErrorMessage( tp, e.toString() );
+ e.printStackTrace();
+ }
+ }
+ if ( ForesterUtil.isEmpty( uri_str ) ) { // 2nd choice: UniProtKB identifier extracted from the node
+ final String upkb = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node );
+ if ( !ForesterUtil.isEmpty( upkb ) ) {
+ try {
+ uri_str = ForesterUtil.UNIPROT_KB + URLEncoder.encode( upkb, ForesterConstants.UTF8 );
+ }
+ catch ( final UnsupportedEncodingException e ) {
+ showErrorMessage( tp, e.toString() );
+ e.printStackTrace();
+ }
+ }
+ }
+ if ( ForesterUtil.isEmpty( uri_str ) ) { // 3rd choice: Genbank accessor
+ final String v = ForesterUtil.extractGenbankAccessor( node );
+ if ( !ForesterUtil.isEmpty( v ) ) {
+ try {
+ if ( SequenceIdParser.isProtein( v ) ) { // ids reported as protein link to NCBI_PROTEIN, all others to NCBI_NUCCORE
+ uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 );
+ }
+ else {
+ uri_str = ForesterUtil.NCBI_NUCCORE + URLEncoder.encode( v, ForesterConstants.UTF8 );
+ }
+ }
+ catch ( final UnsupportedEncodingException e ) {
+ showErrorMessage( tp, e.toString() );
+ e.printStackTrace();
+ }
+ }
+ }
+ if ( ForesterUtil.isEmpty( uri_str ) ) { // 4th choice: RefSeq accessor, linked the same way as a Genbank accessor
+ final String v = ForesterUtil.extractRefSeqAccessorAccessor( node );
+ if ( !ForesterUtil.isEmpty( v ) ) {
+ try {
+ if ( SequenceIdParser.isProtein( v ) ) {
+ uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 );
+ }
+ else {
+ uri_str = ForesterUtil.NCBI_NUCCORE + URLEncoder.encode( v, ForesterConstants.UTF8 );
+ }
+ }
+ catch ( final UnsupportedEncodingException e ) {
+ showErrorMessage( tp, e.toString() );
+ e.printStackTrace();
+ }
+ }
+ }
+ return uri_str; // null if none of the four lookups produced a URI
+ }
+
public static MaskFormatter createMaskFormatter( final String s ) {
MaskFormatter formatter = null;
try {
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
-import java.util.regex.Matcher;
import javax.swing.BorderFactory;
import javax.swing.JApplet;
final String title = clickto_names.get( i );
_node_popup_menu_items[ i ] = new JMenuItem( title );
if ( title.equals( Configuration.clickto_options[ Configuration.open_seq_web ][ 0 ] ) ) {
- _node_popup_menu_items[ i ].setEnabled( isCanOpenSeqWeb( node ) );
+ final String id = isCanOpenSeqWeb( node );
+ if ( !ForesterUtil.isEmpty( id ) ) {
+ _node_popup_menu_items[ i ].setText( _node_popup_menu_items[ i ].getText() + " [" + id + "]" );
+ _node_popup_menu_items[ i ].setEnabled( true );
+ }
+ else {
+ _node_popup_menu_items[ i ].setEnabled( false );
+ }
}
else if ( title.equals( Configuration.clickto_options[ Configuration.open_tax_web ][ 0 ] ) ) {
_node_popup_menu_items[ i ].setEnabled( isCanOpenTaxWeb( node ) );
}
}
- final private boolean isCanOpenSeqWeb( final PhylogenyNode node ) {
+ final private String isCanOpenSeqWeb( final PhylogenyNode node ) { // Returns a label for the node's linkable sequence id; callers treat a null/empty result as "cannot open"
if ( node.getNodeData().isHasSequence()
&& ( node.getNodeData().getSequence().getAccession() != null )
&& !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
&& !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
&& getConfiguration().isHasWebLink( node.getNodeData().getSequence().getAccession().getSource()
.toLowerCase() ) ) {
- return true;
+ return node.getNodeData().getSequence().getAccession().getSource(); // NOTE: this branch returns the accession source, not the accession value
}
- if ( !ForesterUtil.isEmpty( node.getName() )
- && ( AptxUtil.UNIPROT_KB_PATTERN_1.matcher( node.getName() ).find() || AptxUtil.UNIPROT_KB_PATTERN_2
- .matcher( node.getName() ).find() ) ) {
- return true;
+ String v = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node ); // otherwise use the first identifier found: UniProtKB, then Genbank, then RefSeq
+ if ( ForesterUtil.isEmpty( v ) ) {
+ v = ForesterUtil.extractGenbankAccessor( node );
}
- if ( node.getNodeData().isHasSequence() ) {
- Sequence seq = node.getNodeData().getSequence();
- if ( !ForesterUtil.isEmpty( seq.getName() )
- && ( AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getName() ).find() || AptxUtil.UNIPROT_KB_PATTERN_2
- .matcher( seq.getName() ).find() ) ) {
- return true;
- }
- if ( !ForesterUtil.isEmpty( seq.getSymbol() )
- && ( AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() ).find() || AptxUtil.UNIPROT_KB_PATTERN_2
- .matcher( seq.getSymbol() ).find() ) ) {
- return true;
- }
- if ( ( node.getNodeData().getSequence().getAccession() != null )
- && !ForesterUtil.isEmpty( seq.getAccession().getValue() )
- && ( AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() ).find() || AptxUtil.UNIPROT_KB_PATTERN_2
- .matcher( seq.getAccession().getValue() ).find() ) ) {
- return true;
- }
+ if ( ForesterUtil.isEmpty( v ) ) {
+ v = ForesterUtil.extractRefSeqAccessorAccessor( node );
}
- return false;
+ return v; // whatever the RefSeq lookup returned if the earlier lookups were empty; may itself be empty
}
final private void openSeqWeb( final PhylogenyNode node ) {
- if ( !isCanOpenSeqWeb( node ) ) {
+ if ( ForesterUtil.isEmpty( isCanOpenSeqWeb( node ) ) ) {
cannotOpenBrowserWarningMessage( "sequence" );
return;
}
- String uri_str = null;
- if ( node.getNodeData().isHasSequence()
- && ( node.getNodeData().getSequence().getAccession() != null )
- && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
- && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
- && getConfiguration().isHasWebLink( node.getNodeData().getSequence().getAccession().getSource()
- .toLowerCase() ) ) {
- final Sequence seq = node.getNodeData().getSequence();
- final String source = seq.getAccession().getSource().toLowerCase();
- String url;
- if ( source.toLowerCase().equals( "ncbi" ) ) {
- url = Constants.NCBI_ALL_DATABASE_SEARCH;
- }
- else {
- final WebLink weblink = getConfiguration().getWebLink( source );
- url = weblink.getUrl().toString();
- }
- try {
- uri_str = url + URLEncoder.encode( seq.getAccession().getValue(), ForesterConstants.UTF8 );
- }
- catch ( final UnsupportedEncodingException e ) {
- AptxUtil.showErrorMessage( this, e.toString() );
- e.printStackTrace();
- }
- }
- else {
- String upkb = null;
- if ( node.getNodeData().isHasSequence() ) {
- Sequence seq = node.getNodeData().getSequence();
- Matcher m;
- if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {
- m = AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() );
- if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {
- if ( m.find() ) {
- upkb = m.group( 2 );
- }
- else {
- m = AptxUtil.UNIPROT_KB_PATTERN_2.matcher( seq.getSymbol() );
- if ( m.find() ) {
- upkb = m.group();
- }
- }
- }
- }
- if ( ForesterUtil.isEmpty( upkb ) && !ForesterUtil.isEmpty( seq.getName() ) ) {
- m = AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getName() );
- if ( m.find() ) {
- upkb = m.group( 2 );
- }
- else {
- m = AptxUtil.UNIPROT_KB_PATTERN_2.matcher( seq.getName() );
- if ( m.find() ) {
- upkb = m.group();
- }
- }
- }
- if ( ForesterUtil.isEmpty( upkb ) && ( node.getNodeData().getSequence().getAccession() != null )
- && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {
- m = AptxUtil.UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() );
- if ( m.find() ) {
- upkb = m.group( 2 );
- }
- else {
- m = AptxUtil.UNIPROT_KB_PATTERN_2.matcher( seq.getAccession().getValue() );
- if ( m.find() ) {
- upkb = m.group();
- }
- }
- }
- }
- if ( ForesterUtil.isEmpty( upkb ) && !ForesterUtil.isEmpty( node.getName() ) ) {
- final Matcher m1 = AptxUtil.UNIPROT_KB_PATTERN_1.matcher( node.getName() );
- if ( m1.find() ) {
- upkb = m1.group( 2 );
- }
- else {
- final Matcher m2 = AptxUtil.UNIPROT_KB_PATTERN_2.matcher( node.getName() );
- if ( m2.find() ) {
- upkb = m2.group();
- }
- }
- }
- try {
- uri_str = AptxUtil.UNIPROT_KB + URLEncoder.encode( upkb, ForesterConstants.UTF8 );
- }
- catch ( final UnsupportedEncodingException e ) {
- AptxUtil.showErrorMessage( this, e.toString() );
- e.printStackTrace();
- }
- }
+ final String uri_str = AptxUtil.createUriForSeqWeb( node, getConfiguration(), this );
if ( !ForesterUtil.isEmpty( uri_str ) ) {
try {
AptxUtil.launchWebBrowser( new URI( uri_str ),
}
else if ( !ForesterUtil.isEmpty( tax.getScientificName() ) ) {
try {
- uri_str = "http://www.eol.org/search?q="
+ uri_str = "http://www.uniprot.org/taxonomy/?query="
+ URLEncoder.encode( tax.getScientificName(), ForesterConstants.UTF8 );
}
catch ( final UnsupportedEncodingException e ) {
}
else if ( !ForesterUtil.isEmpty( tax.getCommonName() ) ) {
try {
- uri_str = "http://www.eol.org/search?q="
+ uri_str = "http://www.uniprot.org/taxonomy/?query="
+ URLEncoder.encode( tax.getCommonName(), ForesterConstants.UTF8 );
}
catch ( final UnsupportedEncodingException e ) {
}
if ( !ForesterUtil.isEmpty( uri_str ) ) {
try {
- JApplet applet = null;
- if ( isApplet() ) {
- applet = obtainApplet();
- }
- AptxUtil.launchWebBrowser( new URI( uri_str ), isApplet(), applet, "_aptx_tax" );
+ AptxUtil.launchWebBrowser( new URI( uri_str ),
+ isApplet(),
+ isApplet() ? obtainApplet() : null,
+ "_aptx_tax" );
}
catch ( final IOException e ) {
AptxUtil.showErrorMessage( this, e.toString() );
import java.util.Set;
import org.forester.application.support_transfer;
+import org.forester.archaeopteryx.AptxUtil;
import org.forester.development.DevelopmentTools;
import org.forester.evoinference.TestPhylogenyReconstruction;
import org.forester.evoinference.matrix.character.CharacterStateMatrix;
import org.forester.phylogeny.PhylogenyMethods;
import org.forester.phylogeny.PhylogenyNode;
import org.forester.phylogeny.PhylogenyNode.NH_CONVERSION_SUPPORT_VALUE_STYLE;
+import org.forester.phylogeny.data.Accession;
import org.forester.phylogeny.data.BinaryCharacters;
import org.forester.phylogeny.data.BranchWidth;
import org.forester.phylogeny.data.Confidence;
System.out.println( "failed." );
failed++;
}
+ System.out.print( "UniProtKB id extraction: " );
+ if ( Test.testExtractUniProtKbProteinSeqIdentifier() ) {
+ System.out.println( "OK." );
+ succeeded++;
+ }
+ else {
+ System.out.println( "failed." );
+ failed++;
+ }
+ System.out.print( "Uri for Aptx web sequence accession: " );
+ if ( Test.testCreateUriForSeqWeb() ) {
+ System.out.println( "OK." );
+ succeeded++;
+ }
+ else {
+ System.out.println( "failed." );
+ failed++;
+ }
System.out.print( "Basic node construction and parsing of NHX (node level): " );
if ( Test.testNHXNodeParsing() ) {
System.out.println( "OK." );
}
}
+ private static boolean testExtractUniProtKbProteinSeqIdentifier() { // Checks ForesterUtil's UniProtKB and Genbank id extraction; returns true only if every case below passes
+ try {
+ PhylogenyNode n = new PhylogenyNode();
+ n.setName( "tr|B3RJ64" ); // prefixed form: only the accession after the prefix is expected back
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n.setName( "tr.B3RJ64" ); // next cases: other separator characters between prefix and accession
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n.setName( "tr=B3RJ64" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n.setName( "tr-B3RJ64" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n.setName( "tr/B3RJ64" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n.setName( "tr\\B3RJ64" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n.setName( "tr_B3RJ64" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n.setName( " tr|B3RJ64 " ); // next cases: extra characters around the id must not prevent the match
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n.setName( "-tr|B3RJ64-" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n.setName( "-tr=B3RJ64-" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n.setName( "_tr=B3RJ64_" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n.setName( " tr_tr|B3RJ64_sp|123 " );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n.setName( "sp|B3RJ64" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n.setName( "ssp|B3RJ64" ); // next cases: names for which no identifier (null) is expected
+ if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ return false;
+ }
+ n.setName( "sp|B3RJ64C" );
+ if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ return false;
+ }
+ n.setName( "sp B3RJ64" );
+ if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ return false;
+ }
+ n.setName( "sp|B3RJ6X" );
+ if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ return false;
+ }
+ n.setName( "sp|B3RJ6" );
+ if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ return false;
+ }
+ n.setName( "K1PYK7_CRAGI" ); // next cases: NAME_CODE style ids, expected back in full
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+ return false;
+ }
+ n.setName( "K1PYK7_PEA" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PEA" ) ) {
+ return false;
+ }
+ n.setName( "K1PYK7_RAT" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_RAT" ) ) {
+ return false;
+ }
+ n.setName( "K1PYK7_PIG" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
+ return false;
+ }
+ n.setName( "~K1PYK7_PIG~" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
+ return false;
+ }
+ n.setName( "123456_ECOLI-K1PYK7_CRAGI-sp" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+ return false;
+ }
+ n.setName( "K1PYKX_CRAGI" ); // expected: no identifier
+ if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ return false;
+ }
+ n.setName( "XXXXX_CRAGI" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "XXXXX_CRAGI" ) ) {
+ return false;
+ }
+ n.setName( "tr|H3IB65|H3IB65_STRPU~2-2" ); // expected: the accession after the prefix, not the trailing H3IB65_STRPU
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "H3IB65" ) ) {
+ return false;
+ }
+ n.setName( "jgi|Lacbi2|181470|Lacbi1.estExt_GeneWisePlus_human.C_10729~2-3" ); // expected: no identifier
+ if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ return false;
+ }
+ n.setName( "sp|Q86U06|RBM23_HUMAN~2-2" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "Q86U06" ) ) {
+ return false;
+ }
+ n = new PhylogenyNode(); // from here on the id is stored on an attached Sequence instead of the node name
+ org.forester.phylogeny.data.Sequence seq = new org.forester.phylogeny.data.Sequence();
+ seq.setSymbol( "K1PYK7_CRAGI" ); // id in the sequence symbol
+ n.getNodeData().addSequence( seq );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+ return false;
+ }
+ seq.setSymbol( "tr|B3RJ64" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n = new PhylogenyNode();
+ seq = new org.forester.phylogeny.data.Sequence();
+ seq.setName( "K1PYK7_CRAGI" ); // id in the sequence name
+ n.getNodeData().addSequence( seq );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+ return false;
+ }
+ seq.setName( "tr|B3RJ64" );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ n = new PhylogenyNode();
+ seq = new org.forester.phylogeny.data.Sequence();
+ seq.setAccession( new Accession( "K1PYK8_CRAGI", "?" ) ); // id in the accession value
+ n.getNodeData().addSequence( seq );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK8_CRAGI" ) ) {
+ return false;
+ }
+ n = new PhylogenyNode();
+ seq = new org.forester.phylogeny.data.Sequence();
+ seq.setAccession( new Accession( "tr|B3RJ64", "?" ) );
+ n.getNodeData().addSequence( seq );
+ if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ return false;
+ }
+ // Genbank accessor extraction from the node name
+ n = new PhylogenyNode();
+ n.setName( "ACP19736" );
+ if ( !ForesterUtil.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
+ return false;
+ }
+ n = new PhylogenyNode();
+ n.setName( "_ACP19736_" );
+ if ( !ForesterUtil.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
+ return false;
+ }
+ }
+ catch ( final Exception e ) { // any exception, e.g. a NullPointerException from calling equals on a null result, counts as failure
+ e.printStackTrace( System.out );
+ return false;
+ }
+ return true;
+ }
+
+ private static boolean testCreateUriForSeqWeb() { // Checks AptxUtil.createUriForSeqWeb for ids carried in the node name; conf and tp are passed as null throughout
+ try {
+ final PhylogenyNode n = new PhylogenyNode();
+ n.setName( "tr|B3RJ64" ); // expected: UniProtKB link with the bare accession
+ if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.UNIPROT_KB + "B3RJ64" ) ) {
+ System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+ System.exit( -1 ); // NOTE(review): this ends the JVM on a mismatch, so the "return false" below is normally never reached (same in every case of this test) - confirm this is intended
+ return false;
+ }
+ n.setName( "B0LM41_HUMAN" ); // expected: UniProtKB link with the full identifier
+ if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.UNIPROT_KB + "B0LM41_HUMAN" ) ) {
+ System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+ System.exit( -1 );
+ return false;
+ }
+ n.setName( "NP_001025424" ); // expected: NCBI protein link
+ if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "NP_001025424" ) ) {
+ System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+ System.exit( -1 );
+ return false;
+ }
+ n.setName( "_NM_001030253-" ); // expected: NCBI nuccore link, surrounding characters dropped
+ if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_NUCCORE + "NM_001030253" ) ) {
+ System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+ System.exit( -1 );
+ return false;
+ }
+ n.setName( "NP_001025424" ); // same name and expectation as the NP_001025424 case above
+ if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "NP_001025424" ) ) {
+ System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+ System.exit( -1 );
+ return false;
+ }
+ }
+ catch ( final Exception e ) { // any exception counts as failure; the stack trace goes to System.out
+ e.printStackTrace( System.out );
+ return false;
+ }
+ return true;
+ }
+
private static boolean testExtractTaxonomyCodeFromNodeName() {
try {
if ( !ParserUtils.extractTaxonomyCodeFromNodeName( "MOUSE", TAXONOMY_EXTRACTION.PFAM_STYLE_RELAXED )
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.forester.phylogeny.PhylogenyNode;
public static final NumberFormat FORMATTER_6;
public static final NumberFormat FORMATTER_06;
public static final NumberFormat FORMATTER_3;
+ public static final String NCBI_PROTEIN = "http://www.ncbi.nlm.nih.gov/protein/"; // base URL; AptxUtil.createUriForSeqWeb appends the URL-encoded id
+ public static final String NCBI_NUCCORE = "http://www.ncbi.nlm.nih.gov/nuccore/"; // base URL used for ids not reported as protein
+ public final static String UNIPROT_KB = "http://www.uniprot.org/uniprot/"; // base URL for UniProtKB ids
+ public final static Pattern UNIPROT_KB_PATTERN_1 = Pattern
+ .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" ); // "sp" or "tr", one separator character, then a 6-character accession captured as group 1
+ public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern
+ .compile( "\\b(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)\\b" ); // 2-5 alphanumerics or a 6-character accession, "_", then a 5-character code or RAT/PIG/PEA; callers use the whole match
static {
final DecimalFormatSymbols dfs = new DecimalFormatSymbols();
dfs.setDecimalSeparator( '.' );
private ForesterUtil() {
}
+ public static String extractRefSeqAccessorAccessor( final PhylogenyNode node ) { // Looks for a RefSeq accessor in the sequence symbol, sequence name, accession value, then node name; the first non-empty hit wins
+ String v = null;
+ if ( node.getNodeData().isHasSequence() ) {
+ final Sequence seq = node.getNodeData().getSequence();
+ if ( !isEmpty( seq.getSymbol() ) ) {
+ v = SequenceIdParser.parseRefSeqAccessor( seq.getSymbol() );
+ }
+ if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) { // only tried while no accessor has been found yet
+ v = SequenceIdParser.parseRefSeqAccessor( seq.getName() );
+ }
+ if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
+ && !isEmpty( seq.getAccession().getValue() ) ) {
+ v = SequenceIdParser.parseRefSeqAccessor( seq.getAccession().getValue() );
+ }
+ }
+ if ( isEmpty( v ) && !isEmpty( node.getName() ) ) { // last resort: the node's own name, also used when the node has no sequence
+ v = SequenceIdParser.parseRefSeqAccessor( node.getName() );
+ }
+ return v; // empty in the isEmpty sense (possibly null) when no candidate yielded an accessor
+ }
+
+ public static String extractGenbankAccessor( final PhylogenyNode node ) { // Looks for a Genbank accessor in the sequence symbol, sequence name, accession value, then node name; the first non-empty hit wins
+ String v = null;
+ if ( node.getNodeData().isHasSequence() ) {
+ final Sequence seq = node.getNodeData().getSequence();
+ if ( !isEmpty( seq.getSymbol() ) ) {
+ v = SequenceIdParser.parseGenbankAccessor( seq.getSymbol() );
+ }
+ if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) { // only tried while no accessor has been found yet
+ v = SequenceIdParser.parseGenbankAccessor( seq.getName() );
+ }
+ if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
+ && !isEmpty( seq.getAccession().getValue() ) ) {
+ v = SequenceIdParser.parseGenbankAccessor( seq.getAccession().getValue() );
+ }
+ }
+ if ( isEmpty( v ) && !isEmpty( node.getName() ) ) { // last resort: the node's own name, also used when the node has no sequence
+ v = SequenceIdParser.parseGenbankAccessor( node.getName() );
+ }
+ return v; // empty in the isEmpty sense (possibly null) when no candidate yielded an accessor
+ }
+
+ public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) { // Searches the sequence symbol, sequence name, accession value, then node name for a UniProtKB id; returns null if nothing matches
+ String upkb = null;
+ if ( node.getNodeData().isHasSequence() ) {
+ final Sequence seq = node.getNodeData().getSequence();
+ Matcher m;
+ if ( !isEmpty( seq.getSymbol() ) ) {
+ m = UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() ); // pattern 1 is always tried before pattern 2
+ if ( m.find() ) {
+ upkb = m.group( 1 ); // pattern 1: only the captured accession, without the prefix
+ }
+ else {
+ m = UNIPROT_KB_PATTERN_2.matcher( seq.getSymbol() );
+ if ( m.find() ) {
+ upkb = m.group(); // pattern 2: the whole match is the identifier
+ }
+ }
+ }
+ if ( isEmpty( upkb ) && !isEmpty( seq.getName() ) ) { // later candidates are only consulted while nothing has been found
+ m = UNIPROT_KB_PATTERN_1.matcher( seq.getName() );
+ if ( m.find() ) {
+ upkb = m.group( 1 );
+ }
+ else {
+ m = UNIPROT_KB_PATTERN_2.matcher( seq.getName() );
+ if ( m.find() ) {
+ upkb = m.group();
+ }
+ }
+ }
+ if ( isEmpty( upkb ) && ( node.getNodeData().getSequence().getAccession() != null )
+ && !isEmpty( seq.getAccession().getValue() ) ) {
+ m = UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() );
+ if ( m.find() ) {
+ upkb = m.group( 1 );
+ }
+ else {
+ m = UNIPROT_KB_PATTERN_2.matcher( seq.getAccession().getValue() );
+ if ( m.find() ) {
+ upkb = m.group();
+ }
+ }
+ }
+ }
+ if ( isEmpty( upkb ) && !isEmpty( node.getName() ) ) { // last resort: the node's own name, also used when the node has no sequence
+ final Matcher m1 = UNIPROT_KB_PATTERN_1.matcher( node.getName() );
+ if ( m1.find() ) {
+ upkb = m1.group( 1 );
+ }
+ else {
+ final Matcher m2 = UNIPROT_KB_PATTERN_2.matcher( node.getName() );
+ if ( m2.find() ) {
+ upkb = m2.group();
+ }
+ }
+ }
+ return upkb;
+ }
+
final public static void appendSeparatorIfNotEmpty( final StringBuffer sb, final char separator ) {
if ( sb.length() > 0 ) {
sb.append( separator );
* Returns null if no match.\r
* \r
*/\r
- private final static String parseRefSeqAccessor( final String query ) {\r
+ public final static String parseRefSeqAccessor( final String query ) {\r
final Matcher m = REFSEQ_PATTERN.matcher( query );\r
if ( m.lookingAt() ) {\r
return m.group( 1 );\r