import org.forester.phylogeny.PhylogenyMethods;
import org.forester.phylogeny.PhylogenyMethods.DESCENDANT_SORT_PRIORITY;
import org.forester.phylogeny.PhylogenyNode;
+import org.forester.phylogeny.data.Accession;
import org.forester.phylogeny.data.Annotation;
import org.forester.phylogeny.data.BranchColor;
import org.forester.phylogeny.data.Confidence;
import org.forester.util.DescriptiveStatistics;
import org.forester.util.ForesterConstants;
import org.forester.util.ForesterUtil;
-import org.forester.util.SequenceIdParser;
+import org.forester.util.SequenceAccessionTools;
public final class TreePanel extends JPanel implements ActionListener, MouseWheelListener, Printable {
}
}
if ( type == '?' ) {
- if ( SequenceIdParser.isProtein( query ) ) {
+ if ( SequenceAccessionTools.isProtein( query ) ) {
type = 'p';
}
else {
}
/**
 * Returns an accession string for the given node, or null if none is found.
 * Note that, despite the "is" prefix in its name, this method returns a
 * String rather than a boolean.
 *
 * In the new version of this method, the node is handed to
 * SequenceAccessionTools.parse( node ); when that call yields a non-null
 * Accession its getValue() result is returned, otherwise null is returned.
 * The removed version instead tried the ForesterUtil extractors for
 * UniProtKB, GenBank, RefSeq and GI in that order, moving on to the next
 * one only while the value found so far was empty.
 *
 * @param node the node to obtain an accession from
 * @return the accession value, or null
 */
final private String isCanOpenSeqWeb( final PhylogenyNode node ) {
- String v = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node );
- if ( ForesterUtil.isEmpty( v ) ) {
- v = ForesterUtil.extractGenbankAccessor( node );
+ final Accession a = SequenceAccessionTools.parse( node );
+ if ( a != null ) {
+ return a.getValue();
}
- if ( ForesterUtil.isEmpty( v ) ) {
- v = ForesterUtil.extractRefSeqAccessorAccessor( node );
- }
- if ( ForesterUtil.isEmpty( v ) ) {
- v = ForesterUtil.extractGInumber( node );
- }
- return v;
+ return null;
}
final private boolean isCanOpenTaxWeb( final PhylogenyNode node ) {
import org.forester.phylogeny.iterators.PreorderTreeIterator;
import org.forester.util.ForesterConstants;
import org.forester.util.ForesterUtil;
-import org.forester.util.SequenceIdParser;
+import org.forester.util.SequenceAccessionTools;
import org.forester.ws.seqdb.UniProtTaxonomy;
public class TreePanelUtil {
final Configuration conf,
final TreePanel tp ) {
String uri_str = null;
- final String upkb = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node );
+ final String upkb = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node );
if ( !ForesterUtil.isEmpty( upkb ) ) {
try {
uri_str = ForesterUtil.UNIPROT_KB + URLEncoder.encode( upkb, ForesterConstants.UTF8 );
}
}
if ( ForesterUtil.isEmpty( uri_str ) ) {
- final String v = ForesterUtil.extractGenbankAccessor( node );
+ final String v = SequenceAccessionTools.extractGenbankAccessor( node );
if ( !ForesterUtil.isEmpty( v ) ) {
try {
- if ( SequenceIdParser.isProtein( v ) ) {
+ if ( SequenceAccessionTools.isProtein( v ) ) {
uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 );
}
else {
}
}
if ( ForesterUtil.isEmpty( uri_str ) ) {
- final String v = ForesterUtil.extractRefSeqAccessorAccessor( node );
+ final String v = SequenceAccessionTools.extractRefSeqAccessor( node );
if ( !ForesterUtil.isEmpty( v ) ) {
try {
- if ( SequenceIdParser.isProtein( v ) ) {
+ if ( SequenceAccessionTools.isProtein( v ) ) {
uri_str = ForesterUtil.NCBI_PROTEIN + URLEncoder.encode( v, ForesterConstants.UTF8 );
}
else {
}
}
if ( ForesterUtil.isEmpty( uri_str ) ) {
- final String v = ForesterUtil.extractGInumber( node );
+ final String v = SequenceAccessionTools.extractGInumber( node );
if ( !ForesterUtil.isEmpty( v ) ) {
try {
uri_str = ForesterUtil.NCBI_GI + URLEncoder.encode( v, ForesterConstants.UTF8 );
import org.forester.archaeopteryx.AptxUtil;
import org.forester.archaeopteryx.TreePanel;
import org.forester.phylogeny.PhylogenyNode;
-import org.forester.phylogeny.data.Identifier;
+import org.forester.phylogeny.data.Accession;
import org.forester.util.ForesterUtil;
-import org.forester.util.SequenceIdParser;
+import org.forester.util.SequenceAccessionTools;
import org.forester.ws.wabi.RestUtil;
public final class Blast {
}
if ( ForesterUtil.isEmpty( query ) && ( node.getNodeData().getSequence().getAccession() != null )
&& !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) ) {
- final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getAccession()
+ final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getAccession()
.getValue() );
if ( id != null ) {
query = id.getValue();
}
}
if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getName() ) ) {
- final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getName() );
+ final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getName() );
if ( id != null ) {
query = id.getValue();
}
}
if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getSymbol() ) ) {
- final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getSymbol() );
+ final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getSymbol() );
if ( id != null ) {
query = id.getValue();
}
}
if ( ForesterUtil.isEmpty( query )
&& !ForesterUtil.isEmpty( node.getNodeData().getSequence().getGeneName() ) ) {
- final Identifier id = SequenceIdParser.parse( node.getNodeData().getSequence().getGeneName() );
+ final Accession id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getGeneName() );
if ( id != null ) {
query = id.getValue();
}
}
}
if ( ForesterUtil.isEmpty( query ) && !ForesterUtil.isEmpty( node.getName() ) ) {
- final Identifier id = SequenceIdParser.parse( node.getName() );
+ final Accession id = SequenceAccessionTools.parse( node.getName() );
if ( id != null ) {
query = id.getValue();
}
public final class Accession implements PhylogenyData, Comparable<Accession> {
- final private String _comment;
- final private String _source;
- final private String _source_value;
- final private String _value;
+ final private String _comment;
+ final private String _source;
+ final private String _source_value;
+ final private String _value;
+ final public static String NCBI = "ncbi";
+ final public static String REFSEQ = "refseq";
+ final public static String UNIPROT = "uniprot";
+ final public static String GI = "gi";
+ public static final String EMBL = "embl";
+
/**
 * Creates an Accession from a value alone: the source and the comment are
 * set to empty strings, and the source value is set to the same string as
 * the value.
 *
 * @param value the accession value
 */
+ public Accession( final String value ) {
+ _value = value;
+ _source = "";
+ _comment = "";
+ _source_value = value;
+ }
public Accession( final String value, final String source ) {
_value = value;
public final class Identifier implements PhylogenyData {
- final public static String NCBI = "ncbi";
- final public static String REFSEQ = "refseq";
- final public static String SP = "sp";
- final private String _value;
- final private String _provider;
- final private String _value_provider;
+ final private String _value;
+ final private String _provider;
+ final private String _value_provider;
public Identifier() {
_value = "";
import java.util.TreeSet;
import org.forester.util.ForesterUtil;
-import org.forester.util.SequenceIdParser;
+import org.forester.util.SequenceAccessionTools;
class PrintableSpeciesSpecificDcData implements SpeciesSpecificDcData {
private static String obtainSeqLink( final String p ) {
String link;
- final String up_id = ForesterUtil.extractUniProtKbProteinSeqIdentifier( p );
+ final String up_id = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( p );
if ( !ForesterUtil.isEmpty( up_id ) ) {
link = "<a class=\"pl\" href=\"" + ForesterUtil.UNIPROT_KB + up_id + "\" target=\"_up_window\">" + up_id
+ "</a>";
}
else {
- final String gb_id = SequenceIdParser.parseGenbankProteinAccessor( p );
+ final String gb_id = SequenceAccessionTools.parseGenbankProteinAccessor( p );
if ( !ForesterUtil.isEmpty( gb_id ) ) {
link = "<a class=\"pl\" href=\"" + ForesterUtil.NCBI_PROTEIN + gb_id + "\" target=\"_up_window\">"
+ gb_id + "</a>";
}
else {
- final String gi = SequenceIdParser.parseGInumber( p );
+ final String gi = SequenceAccessionTools.parseGInumber( p );
if ( !ForesterUtil.isEmpty( gi ) ) {
link = "<a class=\"pl\" href=\"" + ForesterUtil.NCBI_GI + gi + "\" target=\"_up_window\">gi|" + gi
+ "</a>";
import org.forester.util.ForesterConstants;
import org.forester.util.ForesterUtil;
import org.forester.util.GeneralTable;
-import org.forester.util.SequenceIdParser;
+import org.forester.util.SequenceAccessionTools;
import org.forester.ws.seqdb.SequenceDatabaseEntry;
import org.forester.ws.seqdb.SequenceDbWsTools;
import org.forester.ws.seqdb.UniProtTaxonomy;
if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "AAA34956" ) ) {
return false;
}
- n.setName( "j40f4_Q06891.1_fndn2 fnr3" );
- if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "Q06891.1" ) ) {
+ n.setName( "AAA34956" );
+ if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "AAA34956" ) ) {
return false;
}
n.setName( "GI:394892" );
System.out.println( TreePanelUtil.createUriForSeqWeb( n, null, null ) );
return false;
}
+ n.setName( "P12345" );
+ if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.UNIPROT_KB + "P12345" ) ) {
+ System.out.println( TreePanelUtil.createUriForSeqWeb( n, null, null ) );
+ return false;
+ }
+ n.setName( "gi_fdgjmn-3jk5-243 mnefmn fg023-0 P12345 4395jtmnsrg02345m1ggi92450jrg890j4t0j240" );
+ if ( !TreePanelUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.UNIPROT_KB + "P12345" ) ) {
+ System.out.println( TreePanelUtil.createUriForSeqWeb( n, null, null ) );
+ return false;
+ }
}
catch ( final Exception e ) {
e.printStackTrace( System.out );
//Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals
//Protein: 3 letters + 5 numerals
//http://www.ncbi.nlm.nih.gov/Sequin/acc.html
- if ( !SequenceIdParser.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) {
+ if ( !SequenceAccessionTools.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) {
return false;
}
- if ( !SequenceIdParser.parseGenbankAccessor( ".AY423861.2" ).equals( "AY423861.2" ) ) {
+ if ( !SequenceAccessionTools.parseGenbankAccessor( ".AY423861.2" ).equals( "AY423861.2" ) ) {
return false;
}
- if ( !SequenceIdParser.parseGenbankAccessor( "345_.AY423861.24_345" ).equals( "AY423861.24" ) ) {
+ if ( !SequenceAccessionTools.parseGenbankAccessor( "345_.AY423861.24_345" ).equals( "AY423861.24" ) ) {
return false;
}
- if ( SequenceIdParser.parseGenbankAccessor( "AAY423861" ) != null ) {
+ if ( SequenceAccessionTools.parseGenbankAccessor( "AAY423861" ) != null ) {
return false;
}
- if ( SequenceIdParser.parseGenbankAccessor( "AY4238612" ) != null ) {
+ if ( SequenceAccessionTools.parseGenbankAccessor( "AY4238612" ) != null ) {
return false;
}
- if ( SequenceIdParser.parseGenbankAccessor( "AAY4238612" ) != null ) {
+ if ( SequenceAccessionTools.parseGenbankAccessor( "AAY4238612" ) != null ) {
return false;
}
- if ( SequenceIdParser.parseGenbankAccessor( "Y423861" ) != null ) {
+ if ( SequenceAccessionTools.parseGenbankAccessor( "Y423861" ) != null ) {
return false;
}
- if ( !SequenceIdParser.parseGenbankAccessor( "S12345" ).equals( "S12345" ) ) {
+ if ( !SequenceAccessionTools.parseGenbankAccessor( "S12345" ).equals( "S12345" ) ) {
return false;
}
- if ( !SequenceIdParser.parseGenbankAccessor( "|S12345|" ).equals( "S12345" ) ) {
+ if ( !SequenceAccessionTools.parseGenbankAccessor( "|S12345|" ).equals( "S12345" ) ) {
return false;
}
- if ( SequenceIdParser.parseGenbankAccessor( "|S123456" ) != null ) {
+ if ( SequenceAccessionTools.parseGenbankAccessor( "|S123456" ) != null ) {
return false;
}
- if ( SequenceIdParser.parseGenbankAccessor( "ABC123456" ) != null ) {
+ if ( SequenceAccessionTools.parseGenbankAccessor( "ABC123456" ) != null ) {
return false;
}
- if ( !SequenceIdParser.parseGenbankAccessor( "ABC12345" ).equals( "ABC12345" ) ) {
+ if ( !SequenceAccessionTools.parseGenbankAccessor( "ABC12345" ).equals( "ABC12345" ) ) {
return false;
}
- if ( !SequenceIdParser.parseGenbankAccessor( "&ABC12345&" ).equals( "ABC12345" ) ) {
+ if ( !SequenceAccessionTools.parseGenbankAccessor( "&ABC12345&" ).equals( "ABC12345" ) ) {
return false;
}
- if ( SequenceIdParser.parseGenbankAccessor( "ABCD12345" ) != null ) {
+ if ( SequenceAccessionTools.parseGenbankAccessor( "ABCD12345" ) != null ) {
return false;
}
return true;
try {
PhylogenyNode n = new PhylogenyNode();
n.setName( "tr|B3RJ64" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n.setName( "tr.B3RJ64" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n.setName( "tr=B3RJ64" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n.setName( "tr-B3RJ64" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n.setName( "tr/B3RJ64" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n.setName( "tr\\B3RJ64" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n.setName( "tr_B3RJ64" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n.setName( " tr|B3RJ64 " );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n.setName( "-tr|B3RJ64-" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n.setName( "-tr=B3RJ64-" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n.setName( "_tr=B3RJ64_" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n.setName( " tr_tr|B3RJ64_sp|123 " );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
- n.setName( "sp|B3RJ64" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ n.setName( "B3RJ64" );
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
- n.setName( "ssp|B3RJ64" );
- if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ n.setName( "sp|B3RJ64" );
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n.setName( "sp|B3RJ64C" );
- if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
return false;
}
n.setName( "sp B3RJ64" );
- if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n.setName( "sp|B3RJ6X" );
- if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
return false;
}
n.setName( "sp|B3RJ6" );
- if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
return false;
}
n.setName( "K1PYK7_CRAGI" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
return false;
}
n.setName( "K1PYK7_PEA" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PEA" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PEA" ) ) {
return false;
}
n.setName( "K1PYK7_RAT" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_RAT" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_RAT" ) ) {
return false;
}
n.setName( "K1PYK7_PIG" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
return false;
}
n.setName( "~K1PYK7_PIG~" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_PIG" ) ) {
return false;
}
n.setName( "123456_ECOLI-K1PYK7_CRAGI-sp" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
return false;
}
n.setName( "K1PYKX_CRAGI" );
- if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
return false;
}
n.setName( "XXXXX_CRAGI" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "XXXXX_CRAGI" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "XXXXX_CRAGI" ) ) {
return false;
}
n.setName( "tr|H3IB65|H3IB65_STRPU~2-2" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "H3IB65" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "H3IB65" ) ) {
return false;
}
n.setName( "jgi|Lacbi2|181470|Lacbi1.estExt_GeneWisePlus_human.C_10729~2-3" );
- if ( ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
+ if ( SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ) != null ) {
return false;
}
n.setName( "sp|Q86U06|RBM23_HUMAN~2-2" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "Q86U06" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "Q86U06" ) ) {
return false;
}
n = new PhylogenyNode();
org.forester.phylogeny.data.Sequence seq = new org.forester.phylogeny.data.Sequence();
seq.setSymbol( "K1PYK7_CRAGI" );
n.getNodeData().addSequence( seq );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
return false;
}
seq.setSymbol( "tr|B3RJ64" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n = new PhylogenyNode();
seq = new org.forester.phylogeny.data.Sequence();
seq.setName( "K1PYK7_CRAGI" );
n.getNodeData().addSequence( seq );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK7_CRAGI" ) ) {
return false;
}
seq.setName( "tr|B3RJ64" );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
n = new PhylogenyNode();
seq = new org.forester.phylogeny.data.Sequence();
seq.setAccession( new Accession( "K1PYK8_CRAGI", "?" ) );
n.getNodeData().addSequence( seq );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK8_CRAGI" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "K1PYK8_CRAGI" ) ) {
return false;
}
n = new PhylogenyNode();
seq = new org.forester.phylogeny.data.Sequence();
seq.setAccession( new Accession( "tr|B3RJ64", "?" ) );
n.getNodeData().addSequence( seq );
- if ( !ForesterUtil.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
+ if ( !SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( n ).equals( "B3RJ64" ) ) {
return false;
}
//
n = new PhylogenyNode();
n.setName( "ACP19736" );
- if ( !ForesterUtil.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
+ if ( !SequenceAccessionTools.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
return false;
}
n = new PhylogenyNode();
n.setName( "_ACP19736_" );
- if ( !ForesterUtil.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
+ if ( !SequenceAccessionTools.extractGenbankAccessor( n ).equals( "ACP19736" ) ) {
return false;
}
}
private static boolean testSequenceIdParsing() {
try {
- Identifier id = SequenceIdParser.parse( "gb_ADF31344_segmented_worms_" );
- if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
- || !id.getValue().equals( "ADF31344" ) || !id.getProvider().equals( "ncbi" ) ) {
+ Accession id = SequenceAccessionTools.parse( "gb_ADF31344_segmented_worms_" );
+ if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+ || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
- System.out.println( "provider=" + id.getProvider() );
+ System.out.println( "provider=" + id.getSource() );
}
return false;
}
//
- id = SequenceIdParser.parse( "segmented worms|gb_ADF31344" );
- if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
- || !id.getValue().equals( "ADF31344" ) || !id.getProvider().equals( "ncbi" ) ) {
+ id = SequenceAccessionTools.parse( "segmented worms|gb_ADF31344" );
+ if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+ || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
- System.out.println( "provider=" + id.getProvider() );
+ System.out.println( "provider=" + id.getSource() );
}
return false;
}
//
- id = SequenceIdParser.parse( "segmented worms gb_ADF31344 and more" );
- if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
- || !id.getValue().equals( "ADF31344" ) || !id.getProvider().equals( "ncbi" ) ) {
+ id = SequenceAccessionTools.parse( "segmented worms gb_ADF31344 and more" );
+ if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+ || !id.getValue().equals( "ADF31344" ) || !id.getSource().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
- System.out.println( "provider=" + id.getProvider() );
+ System.out.println( "provider=" + id.getSource() );
}
return false;
}
//
- id = SequenceIdParser.parse( "gb_AAA96518_1" );
- if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
- || !id.getValue().equals( "AAA96518" ) || !id.getProvider().equals( "ncbi" ) ) {
+ id = SequenceAccessionTools.parse( "gb_AAA96518_1" );
+ if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+ || !id.getValue().equals( "AAA96518" ) || !id.getSource().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
- System.out.println( "provider=" + id.getProvider() );
+ System.out.println( "provider=" + id.getSource() );
}
return false;
}
//
- id = SequenceIdParser.parse( "gb_EHB07727_1_rodents_" );
- if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
- || !id.getValue().equals( "EHB07727" ) || !id.getProvider().equals( "ncbi" ) ) {
+ id = SequenceAccessionTools.parse( "gb_EHB07727_1_rodents_" );
+ if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+ || !id.getValue().equals( "EHB07727" ) || !id.getSource().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
- System.out.println( "provider=" + id.getProvider() );
+ System.out.println( "provider=" + id.getSource() );
}
return false;
}
//
- id = SequenceIdParser.parse( "dbj_BAF37827_1_turtles_" );
- if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
- || !id.getValue().equals( "BAF37827" ) || !id.getProvider().equals( "ncbi" ) ) {
+ id = SequenceAccessionTools.parse( "dbj_BAF37827_1_turtles_" );
+ if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+ || !id.getValue().equals( "BAF37827" ) || !id.getSource().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
- System.out.println( "provider=" + id.getProvider() );
+ System.out.println( "provider=" + id.getSource() );
}
return false;
}
//
- id = SequenceIdParser.parse( "emb_CAA73223_1_primates_" );
- if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
- || !id.getValue().equals( "CAA73223" ) || !id.getProvider().equals( "ncbi" ) ) {
+ id = SequenceAccessionTools.parse( "emb_CAA73223_1_primates_" );
+ if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+ || !id.getValue().equals( "CAA73223" ) || !id.getSource().equals( "ncbi" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
- System.out.println( "provider=" + id.getProvider() );
+ System.out.println( "provider=" + id.getSource() );
}
return false;
}
//
- id = SequenceIdParser.parse( "mites|ref_XP_002434188_1" );
- if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
- || !id.getValue().equals( "XP_002434188" ) || !id.getProvider().equals( "refseq" ) ) {
+ id = SequenceAccessionTools.parse( "mites|ref_XP_002434188_1" );
+ if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+ || !id.getValue().equals( "XP_002434188" ) || !id.getSource().equals( "refseq" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
- System.out.println( "provider=" + id.getProvider() );
+ System.out.println( "provider=" + id.getSource() );
}
return false;
}
//
- id = SequenceIdParser.parse( "mites_ref_XP_002434188_1_bla_XP_12345" );
- if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
- || !id.getValue().equals( "XP_002434188" ) || !id.getProvider().equals( "refseq" ) ) {
+ id = SequenceAccessionTools.parse( "mites_ref_XP_002434188_1_bla_XP_12345" );
+ if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+ || !id.getValue().equals( "XP_002434188" ) || !id.getSource().equals( "refseq" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
- System.out.println( "provider=" + id.getProvider() );
+ System.out.println( "provider=" + id.getSource() );
}
return false;
}
//
- id = SequenceIdParser.parse( "P4A123" );
- if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
- || !id.getValue().equals( "P4A123" ) || !id.getProvider().equals( "sp" ) ) {
+ id = SequenceAccessionTools.parse( "P4A123" );
+ if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+ || !id.getValue().equals( "P4A123" ) || !id.getSource().equals( "uniprot" ) ) {
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
- System.out.println( "provider=" + id.getProvider() );
+ System.out.println( "provider=" + id.getSource() );
}
return false;
}
//
- id = SequenceIdParser.parse( "pllf[pok P4A123_osdjfosnqo035-9233332904i000490 vf tmv x45" );
- if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getProvider() )
- || !id.getValue().equals( "P4A123" ) || !id.getProvider().equals( "sp" ) ) {
- if ( id != null ) {
- System.out.println( "value =" + id.getValue() );
- System.out.println( "provider=" + id.getProvider() );
- }
- return false;
- }
+ // id = SequenceAccessionTools.parse( "pllf[pok P4A123_osdjfosnqo035-9233332904i000490 vf tmv x45" );
+ // if ( ( id == null ) || ForesterUtil.isEmpty( id.getValue() ) || ForesterUtil.isEmpty( id.getSource() )
+ // || !id.getValue().equals( "P4A123" ) || !id.getSource().equals( "sp" ) ) {
+ // if ( id != null ) {
+ // System.out.println( "value =" + id.getValue() );
+ // System.out.println( "provider=" + id.getSource() );
+ // }
+ // return false;
+ // }
//
- id = SequenceIdParser.parse( "XP_12345" );
+ id = SequenceAccessionTools.parse( "XP_12345" );
if ( id != null ) {
System.out.println( "value =" + id.getValue() );
- System.out.println( "provider=" + id.getProvider() );
+ System.out.println( "provider=" + id.getSource() );
return false;
}
// lcl_91970_unknown_
public static final String NCBI_PROTEIN = "http://www.ncbi.nlm.nih.gov/protein/";
public static final String NCBI_NUCCORE = "http://www.ncbi.nlm.nih.gov/nuccore/";
public final static String UNIPROT_KB = "http://www.uniprot.org/uniprot/";
- public final static Pattern UNIPROT_KB_PATTERN_1 = Pattern
- .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );
- public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern
- .compile( "\\b(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)\\b" );
public static final String NCBI_GI = "http://www.ncbi.nlm.nih.gov/protein/gi:";
static {
final DecimalFormatSymbols dfs = new DecimalFormatSymbols();
private ForesterUtil() {
}
- public static String extractRefSeqAccessorAccessor( final PhylogenyNode node ) {
- String v = null;
- if ( node.getNodeData().isHasSequence() ) {
- final Sequence seq = node.getNodeData().getSequence();
- if ( !isEmpty( seq.getSymbol() ) ) {
- v = SequenceIdParser.parseRefSeqAccessor( seq.getSymbol() );
- }
- if ( !isEmpty( seq.getGeneName() ) ) {
- v = SequenceIdParser.parseRefSeqAccessor( seq.getGeneName() );
- }
- if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) {
- v = SequenceIdParser.parseRefSeqAccessor( seq.getName() );
- }
- if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
- && !isEmpty( seq.getAccession().getValue() ) ) {
- v = SequenceIdParser.parseRefSeqAccessor( seq.getAccession().getValue() );
- }
- }
- if ( isEmpty( v ) && !isEmpty( node.getName() ) ) {
- v = SequenceIdParser.parseRefSeqAccessor( node.getName() );
- }
- return v;
- }
-
- public static String extractGenbankAccessor( final PhylogenyNode node ) {
- String v = null;
- if ( node.getNodeData().isHasSequence() ) {
- final Sequence seq = node.getNodeData().getSequence();
- if ( !isEmpty( seq.getSymbol() ) ) {
- v = SequenceIdParser.parseGenbankAccessor( seq.getSymbol() );
- }
- if ( !isEmpty( seq.getGeneName() ) ) {
- v = SequenceIdParser.parseGenbankAccessor( seq.getGeneName() );
- }
- if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) {
- v = SequenceIdParser.parseGenbankAccessor( seq.getName() );
- }
- if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
- && !isEmpty( seq.getAccession().getValue() ) ) {
- v = SequenceIdParser.parseGenbankAccessor( seq.getAccession().getValue() );
- }
- }
- if ( isEmpty( v ) && !isEmpty( node.getName() ) ) {
- v = SequenceIdParser.parseGenbankAccessor( node.getName() );
- }
- return v;
- }
-
- public static String extractGInumber( final PhylogenyNode node ) {
- String v = null;
- if ( node.getNodeData().isHasSequence() ) {
- final Sequence seq = node.getNodeData().getSequence();
- if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) {
- v = SequenceIdParser.parseGInumber( seq.getName() );
- }
- if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null )
- && !isEmpty( seq.getAccession().getValue() ) ) {
- v = SequenceIdParser.parseGInumber( seq.getAccession().getValue() );
- }
- }
- if ( isEmpty( v ) && !isEmpty( node.getName() ) ) {
- v = SequenceIdParser.parseGInumber( node.getName() );
- }
- return v;
- }
-
- public static String extractUniProtKbProteinSeqIdentifier( final String str ) {
- String upkb = null;
- Matcher m = UNIPROT_KB_PATTERN_1.matcher( str );
- if ( m.find() ) {
- upkb = m.group( 1 );
- }
- else {
- m = UNIPROT_KB_PATTERN_2.matcher( str );
- if ( m.find() ) {
- upkb = m.group();
- }
- }
- return upkb;
- }
-
- public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) {
- String upkb = null;
- if ( node.getNodeData().isHasSequence() ) {
- final Sequence seq = node.getNodeData().getSequence();
- Matcher m;
- if ( !isEmpty( seq.getSymbol() ) ) {
- m = UNIPROT_KB_PATTERN_1.matcher( seq.getSymbol() );
- if ( m.find() ) {
- upkb = m.group( 1 );
- }
- else {
- m = UNIPROT_KB_PATTERN_2.matcher( seq.getSymbol() );
- if ( m.find() ) {
- upkb = m.group();
- }
- }
- }
- if ( isEmpty( upkb ) && !isEmpty( seq.getName() ) ) {
- m = UNIPROT_KB_PATTERN_1.matcher( seq.getName() );
- if ( m.find() ) {
- upkb = m.group( 1 );
- }
- else {
- m = UNIPROT_KB_PATTERN_2.matcher( seq.getName() );
- if ( m.find() ) {
- upkb = m.group();
- }
- }
- }
- if ( isEmpty( upkb ) && !isEmpty( seq.getGeneName() ) ) {
- m = UNIPROT_KB_PATTERN_1.matcher( seq.getGeneName() );
- if ( m.find() ) {
- upkb = m.group( 1 );
- }
- else {
- m = UNIPROT_KB_PATTERN_2.matcher( seq.getGeneName() );
- if ( m.find() ) {
- upkb = m.group();
- }
- }
- }
- if ( isEmpty( upkb ) && ( node.getNodeData().getSequence().getAccession() != null )
- && !isEmpty( seq.getAccession().getValue() ) ) {
- m = UNIPROT_KB_PATTERN_1.matcher( seq.getAccession().getValue() );
- if ( m.find() ) {
- upkb = m.group( 1 );
- }
- else {
- m = UNIPROT_KB_PATTERN_2.matcher( seq.getAccession().getValue() );
- if ( m.find() ) {
- upkb = m.group();
- }
- }
- }
- }
- if ( isEmpty( upkb ) && !isEmpty( node.getName() ) ) {
- final Matcher m1 = UNIPROT_KB_PATTERN_1.matcher( node.getName() );
- if ( m1.find() ) {
- upkb = m1.group( 1 );
- }
- else {
- final Matcher m2 = UNIPROT_KB_PATTERN_2.matcher( node.getName() );
- if ( m2.find() ) {
- upkb = m2.group();
- }
- }
- }
- return upkb;
- }
-
final public static void appendSeparatorIfNotEmpty( final StringBuffer sb, final char separator ) {
if ( sb.length() > 0 ) {
sb.append( separator );
--- /dev/null
+// $Id:\r
+// FORESTER -- software libraries and applications\r
+// for evolutionary biology research and applications.\r
+//\r
+// Copyright (C) 2008-2009 Christian M. Zmasek\r
+// Copyright (C) 2008-2009 Burnham Institute for Medical Research\r
+// Copyright (C) 2000-2001 Washington University School of Medicine\r
+// and Howard Hughes Medical Institute\r
+// Copyright (C) 2003-2007 Ethalinda K.S. Cannon\r
+// All rights reserved\r
+//\r
+// This library is free software; you can redistribute it and/or\r
+// modify it under the terms of the GNU Lesser General Public\r
+// License as published by the Free Software Foundation; either\r
+// version 2.1 of the License, or (at your option) any later version.\r
+//\r
+// This library is distributed in the hope that it will be useful,\r
+// but WITHOUT ANY WARRANTY; without even the implied warranty of\r
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\r
+// Lesser General Public License for more details.\r
+//\r
+// You should have received a copy of the GNU Lesser General Public\r
+// License along with this library; if not, write to the Free Software\r
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA\r
+//\r
+// Contact: phylosoft @ gmail . com\r
+// WWW: https://sites.google.com/site/cmzmasek/home/software/forester\r
+\r
+package org.forester.util;\r
+\r
+import java.util.regex.Matcher;\r
+import java.util.regex.Pattern;\r
+\r
+import org.forester.phylogeny.PhylogenyNode;\r
+import org.forester.phylogeny.data.Accession;\r
+import org.forester.phylogeny.data.Sequence;\r
+\r
+public final class SequenceAccessionTools {\r
+\r
+ public final static Pattern UNIPROT_KB_PATTERN_0 = Pattern\r
+ .compile( "\\b([A-Z][0-9][A-Z0-9]{3}[0-9])\\b" );\r
+ public final static Pattern UNIPROT_KB_PATTERN_1 = Pattern\r
+ .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );\r
+ public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern\r
+ .compile( "(?:\\b|_)(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)(?:\\b|_)" );\r
+ // gb_ADF31344_1_segmented_worms_\r
+ // gb_AAA96518_1\r
+ // gb_EHB07727_1_rodents_\r
+ // dbj_BAF37827_1_turtles_\r
+ // emb_CAA73223_1_primates_\r
+ // lcl_91970_unknown_\r
+ // mites|ref_XP_002434188_1\r
+ // ref_XP_002434188_1_mites___ticks_\r
+ // ref_NP_001121530_1_frogs___toads_\r
+ //The formats for GenBank accession numbers are:\r
+ //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals\r
+ //Protein: 3 letters + 5 numerals\r
+ //http://www.ncbi.nlm.nih.gov/Sequin/acc.html\r
+ private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern\r
+ .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
+ private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern\r
+ .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
+ private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern\r
+ .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
+ private final static Pattern GI_PATTERN = Pattern\r
+ .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" );\r
+ // RefSeq accession numbers can be distinguished from GenBank accessions \r
+ // by their distinct prefix format of 2 characters followed by an\r
+ // underscore character ('_'). For example, a RefSeq protein accession is NP_015325. \r
+ private final static Pattern REFSEQ_PATTERN = Pattern\r
+ .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );\r
+\r
+ private SequenceAccessionTools() {\r
+ // Hiding the constructor.\r
+ }\r
+\r
+    /**\r
+     * Searches the data attached to a node for a GenBank accession.\r
+     * <p>\r
+     * When the node has a sequence, its symbol, gene name, name and accession\r
+     * value are tried in that order; the node name is tried last. The first\r
+     * field producing a match wins.\r
+     *\r
+     * @param node the node to inspect\r
+     * @return the match reported by {@link #parseGenbankAccessor(String)} for the\r
+     *         first matching field, or null if no field matches\r
+     */\r
+    public static String extractGenbankAccessor( final PhylogenyNode node ) {\r
+        String v = null;\r
+        if ( node.getNodeData().isHasSequence() ) {\r
+            final Sequence seq = node.getNodeData().getSequence();\r
+            if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {\r
+                v = parseGenbankAccessor( seq.getSymbol() );\r
+            }\r
+            // Fall back to the gene name only when the symbol gave nothing; an\r
+            // unguarded lookup here would discard a match found in the symbol.\r
+            if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getGeneName() ) ) {\r
+                v = parseGenbankAccessor( seq.getGeneName() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) {\r
+                v = parseGenbankAccessor( seq.getName() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( v ) && ( seq.getAccession() != null )\r
+                    && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {\r
+                v = parseGenbankAccessor( seq.getAccession().getValue() );\r
+            }\r
+        }\r
+        if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) {\r
+            v = parseGenbankAccessor( node.getName() );\r
+        }\r
+        return v;\r
+    }\r
+\r
+    /**\r
+     * Searches the data attached to a node for a GI number.\r
+     * <p>\r
+     * When the node has a sequence, the sequence name and then the accession\r
+     * value are tried; the node name is the last resort.\r
+     *\r
+     * @param node the node to inspect\r
+     * @return the result of {@link #parseGInumber(String)} for the first matching\r
+     *         field, or null if nothing matches\r
+     */\r
+    public static String extractGInumber( final PhylogenyNode node ) {\r
+        String gi = null;\r
+        if ( node.getNodeData().isHasSequence() ) {\r
+            final Sequence seq = node.getNodeData().getSequence();\r
+            final String seq_name = seq.getName();\r
+            if ( !ForesterUtil.isEmpty( seq_name ) ) {\r
+                gi = parseGInumber( seq_name );\r
+            }\r
+            if ( ForesterUtil.isEmpty( gi ) && ( seq.getAccession() != null ) ) {\r
+                final String acc_value = seq.getAccession().getValue();\r
+                if ( !ForesterUtil.isEmpty( acc_value ) ) {\r
+                    gi = parseGInumber( acc_value );\r
+                }\r
+            }\r
+        }\r
+        if ( !ForesterUtil.isEmpty( gi ) ) {\r
+            return gi;\r
+        }\r
+        final String node_name = node.getName();\r
+        return ForesterUtil.isEmpty( node_name ) ? gi : parseGInumber( node_name );\r
+    }\r
+\r
+    /**\r
+     * Searches the data attached to a node for a RefSeq accession.\r
+     * <p>\r
+     * When the node has a sequence, its symbol, gene name, name and accession\r
+     * value are tried in that order; the node name is tried last. The first\r
+     * field producing a match wins.\r
+     *\r
+     * @param node the node to inspect\r
+     * @return the match reported by {@link #parseRefSeqAccessor(String)} for the\r
+     *         first matching field, or null if no field matches\r
+     */\r
+    public static String extractRefSeqAccessor( final PhylogenyNode node ) {\r
+        String v = null;\r
+        if ( node.getNodeData().isHasSequence() ) {\r
+            final Sequence seq = node.getNodeData().getSequence();\r
+            if ( !ForesterUtil.isEmpty( seq.getSymbol() ) ) {\r
+                v = parseRefSeqAccessor( seq.getSymbol() );\r
+            }\r
+            // Fall back to the gene name only when the symbol gave nothing; an\r
+            // unguarded lookup here would discard a match found in the symbol.\r
+            if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getGeneName() ) ) {\r
+                v = parseRefSeqAccessor( seq.getGeneName() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( seq.getName() ) ) {\r
+                v = parseRefSeqAccessor( seq.getName() );\r
+            }\r
+            if ( ForesterUtil.isEmpty( v ) && ( seq.getAccession() != null )\r
+                    && !ForesterUtil.isEmpty( seq.getAccession().getValue() ) ) {\r
+                v = parseRefSeqAccessor( seq.getAccession().getValue() );\r
+            }\r
+        }\r
+        if ( ForesterUtil.isEmpty( v ) && !ForesterUtil.isEmpty( node.getName() ) ) {\r
+            v = parseRefSeqAccessor( node.getName() );\r
+        }\r
+        return v;\r
+    }\r
+\r
+    /**\r
+     * Searches the data attached to a node for a UniProtKB identifier.\r
+     * <p>\r
+     * When the node has a sequence, its symbol, name, gene name and accession\r
+     * value are tried in that order; the node name is tried last.\r
+     *\r
+     * @param node the node to inspect\r
+     * @return the identifier found in the first matching field, or null if no\r
+     *         field matches\r
+     */\r
+    public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) {\r
+        String id = null;\r
+        if ( node.getNodeData().isHasSequence() ) {\r
+            final Sequence seq = node.getNodeData().getSequence();\r
+            final String acc_value = ( seq.getAccession() != null ) ? seq.getAccession().getValue() : null;\r
+            // Candidate fields, listed in lookup priority order.\r
+            final String[] candidates = { seq.getSymbol(), seq.getName(), seq.getGeneName(), acc_value };\r
+            for( final String candidate : candidates ) {\r
+                if ( ForesterUtil.isEmpty( candidate ) ) {\r
+                    continue;\r
+                }\r
+                id = extractUniProtKbProteinSeqIdentifier( candidate );\r
+                if ( !ForesterUtil.isEmpty( id ) ) {\r
+                    break;\r
+                }\r
+            }\r
+        }\r
+        if ( ForesterUtil.isEmpty( id ) && !ForesterUtil.isEmpty( node.getName() ) ) {\r
+            id = extractUniProtKbProteinSeqIdentifier( node.getName() );\r
+        }\r
+        return id;\r
+    }\r
+\r
+    /**\r
+     * Extracts a UniProtKB identifier from a string by trying the three\r
+     * UNIPROT_KB patterns in turn (0, then 1, then 2).\r
+     *\r
+     * @param str the text to search\r
+     * @return capture group 1 for patterns 0 and 1, the whole match for\r
+     *         pattern 2, or null if none of the patterns is found\r
+     */\r
+    public static String extractUniProtKbProteinSeqIdentifier( final String str ) {\r
+        final Matcher bare = UNIPROT_KB_PATTERN_0.matcher( str );\r
+        if ( bare.find() ) {\r
+            return bare.group( 1 );\r
+        }\r
+        final Matcher prefixed = UNIPROT_KB_PATTERN_1.matcher( str );\r
+        if ( prefixed.find() ) {\r
+            return prefixed.group( 1 );\r
+        }\r
+        final Matcher entry_name = UNIPROT_KB_PATTERN_2.matcher( str );\r
+        // Pattern 2 is returned whole, not just a capture group.\r
+        return entry_name.find() ? entry_name.group() : null;\r
+    }\r
+\r
+    /**\r
+     * Tells whether a query looks like a protein accession.\r
+     * <p>\r
+     * The checks, in order: a RefSeq accession whose second character is 'P',\r
+     * any UniProtKB identifier, and finally the GenBank protein accession\r
+     * pattern.\r
+     *\r
+     * @param query the text to classify\r
+     * @return true if one of the checks succeeds\r
+     */\r
+    public final static boolean isProtein( final String query ) {\r
+        final String refseq = parseRefSeqAccessor( query );\r
+        final boolean refseq_protein = !ForesterUtil.isEmpty( refseq ) && ( refseq.charAt( 1 ) == 'P' );\r
+        // Short-circuit evaluation keeps the original order of the checks.\r
+        return refseq_protein || !ForesterUtil.isEmpty( extractUniProtKbProteinSeqIdentifier( query ) )\r
+                || GENBANK_PROTEIN_AC_PATTERN.matcher( query ).lookingAt();\r
+    }\r
+\r
+    /**\r
+     * Builds an accession from the data attached to a node.\r
+     * <p>\r
+     * The extractors are tried in this order: UniProtKB, GenBank, RefSeq, GI.\r
+     * The first non-empty result determines the source of the new accession.\r
+     *\r
+     * @param n the node to inspect\r
+     * @return a new accession, or null if no extractor finds anything\r
+     */\r
+    public final static Accession parse( final PhylogenyNode n ) {\r
+        final String uniprot = extractUniProtKbProteinSeqIdentifier( n );\r
+        if ( !ForesterUtil.isEmpty( uniprot ) ) {\r
+            return new Accession( uniprot, Accession.UNIPROT );\r
+        }\r
+        final String genbank = extractGenbankAccessor( n );\r
+        if ( !ForesterUtil.isEmpty( genbank ) ) {\r
+            return new Accession( genbank, Accession.NCBI );\r
+        }\r
+        final String refseq = extractRefSeqAccessor( n );\r
+        if ( !ForesterUtil.isEmpty( refseq ) ) {\r
+            return new Accession( refseq, Accession.REFSEQ );\r
+        }\r
+        final String gi = extractGInumber( n );\r
+        return ForesterUtil.isEmpty( gi ) ? null : new Accession( gi, Accession.GI );\r
+    }\r
+\r
+    /**\r
+     * Normalizes the accession already stored on a node's sequence.\r
+     * <p>\r
+     * The stored source is lower-cased and mapped as follows: anything starting\r
+     * with "uniprot", or equal to "swissprot", "trembl" or "sp", becomes\r
+     * UNIPROT; "embl" or "ebi" becomes EMBL; "ncbi" or "genbank" becomes NCBI;\r
+     * "refseq" becomes REFSEQ; "gi" becomes GI.\r
+     *\r
+     * @param node the node to inspect\r
+     * @return a new accession carrying the stored value and the normalized\r
+     *         source, or null if the node has no sequence, no accession, an\r
+     *         empty source or value, or a source that is not recognized\r
+     */\r
+    public final static Accession obtainFromSeqAccession( final PhylogenyNode node ) {\r
+        if ( !node.getNodeData().isHasSequence() ) {\r
+            return null;\r
+        }\r
+        final Accession seq_acc = node.getNodeData().getSequence().getAccession();\r
+        if ( ( seq_acc == null ) || ForesterUtil.isEmpty( seq_acc.getSource() )\r
+                || ForesterUtil.isEmpty( seq_acc.getValue() ) ) {\r
+            return null;\r
+        }\r
+        // Locale.ROOT keeps the comparison independent of the default locale\r
+        // (e.g. under a Turkish locale "GI" would lower-case with a dotless i).\r
+        final String source = seq_acc.getSource().toLowerCase( java.util.Locale.ROOT );\r
+        final String value = seq_acc.getValue();\r
+        if ( source.startsWith( "uniprot" ) || source.equals( "swissprot" ) || source.equals( "trembl" )\r
+                || source.equals( "sp" ) ) {\r
+            return new Accession( value, Accession.UNIPROT );\r
+        }\r
+        else if ( source.equals( "embl" ) || source.equals( "ebi" ) ) {\r
+            return new Accession( value, Accession.EMBL );\r
+        }\r
+        else if ( source.equals( "ncbi" ) || source.equals( "genbank" ) ) {\r
+            return new Accession( value, Accession.NCBI );\r
+        }\r
+        else if ( source.equals( "refseq" ) ) {\r
+            return new Accession( value, Accession.REFSEQ );\r
+        }\r
+        else if ( source.equals( "gi" ) ) {\r
+            return new Accession( value, Accession.GI );\r
+        }\r
+        return null;\r
+    }\r
+\r
+    /**\r
+     * Builds an accession from free text, trying UniProtKB, GenBank, RefSeq\r
+     * and GI recognition in that order.\r
+     *\r
+     * @param s the text to parse\r
+     * @return a new accession for the first recognizer that matches, or null if\r
+     *         the text is empty or nothing matches\r
+     */\r
+    public final static Accession parse( final String s ) {\r
+        if ( ForesterUtil.isEmpty( s ) ) {\r
+            return null;\r
+        }\r
+        final String uniprot = extractUniProtKbProteinSeqIdentifier( s );\r
+        if ( !ForesterUtil.isEmpty( uniprot ) ) {\r
+            return new Accession( uniprot, Accession.UNIPROT );\r
+        }\r
+        final String genbank = parseGenbankAccessor( s );\r
+        if ( !ForesterUtil.isEmpty( genbank ) ) {\r
+            return new Accession( genbank, Accession.NCBI );\r
+        }\r
+        final String refseq = parseRefSeqAccessor( s );\r
+        if ( !ForesterUtil.isEmpty( refseq ) ) {\r
+            return new Accession( refseq, Accession.REFSEQ );\r
+        }\r
+        final String gi = parseGInumber( s );\r
+        return ForesterUtil.isEmpty( gi ) ? null : new Accession( gi, Accession.GI );\r
+    }\r
+\r
+    /**\r
+     * Extracts a GenBank accession from a query, trying the two nucleotide\r
+     * patterns first and the protein pattern last.\r
+     *\r
+     * @param query the text to search\r
+     * @return capture group 1 of the first pattern that matches, or null if\r
+     *         none does\r
+     */\r
+    public static String parseGenbankAccessor( final String query ) {\r
+        final Pattern[] in_priority_order = { GENBANK_NUCLEOTIDE_AC_PATTERN_1, GENBANK_NUCLEOTIDE_AC_PATTERN_2,\r
+                GENBANK_PROTEIN_AC_PATTERN };\r
+        for( final Pattern pattern : in_priority_order ) {\r
+            final Matcher m = pattern.matcher( query );\r
+            if ( m.lookingAt() ) {\r
+                return m.group( 1 );\r
+            }\r
+        }\r
+        return null;\r
+    }\r
+\r
+    /**\r
+     * Extracts a GenBank protein accession from a query using\r
+     * GENBANK_PROTEIN_AC_PATTERN.\r
+     *\r
+     * @param query the text to search\r
+     * @return capture group 1 of the match, or null if there is no match\r
+     */\r
+    public static String parseGenbankProteinAccessor( final String query ) {\r
+        final Matcher protein = GENBANK_PROTEIN_AC_PATTERN.matcher( query );\r
+        return protein.lookingAt() ? protein.group( 1 ) : null;\r
+    }\r
+\r
+    /**\r
+     * Extracts the digits of a GI number from a query using GI_PATTERN.\r
+     *\r
+     * @param query the text to search\r
+     * @return the digits captured by the pattern, or null if it is not found\r
+     */\r
+    public static String parseGInumber( final String query ) {\r
+        final Matcher gi = GI_PATTERN.matcher( query );\r
+        return gi.find() ? gi.group( 1 ) : null;\r
+    }\r
+\r
+    /**\r
+     * Extracts a RefSeq accession from a query using REFSEQ_PATTERN.\r
+     *\r
+     * @param query the text to search\r
+     * @return capture group 1 of the match, or null if there is no match\r
+     */\r
+    public final static String parseRefSeqAccessor( final String query ) {\r
+        final Matcher refseq = REFSEQ_PATTERN.matcher( query );\r
+        if ( !refseq.lookingAt() ) {\r
+            return null;\r
+        }\r
+        return refseq.group( 1 );\r
+    }\r
+}\r
+++ /dev/null
-// $Id:\r
-// FORESTER -- software libraries and applications\r
-// for evolutionary biology research and applications.\r
-//\r
-// Copyright (C) 2008-2009 Christian M. Zmasek\r
-// Copyright (C) 2008-2009 Burnham Institute for Medical Research\r
-// Copyright (C) 2000-2001 Washington University School of Medicine\r
-// and Howard Hughes Medical Institute\r
-// Copyright (C) 2003-2007 Ethalinda K.S. Cannon\r
-// All rights reserved\r
-//\r
-// This library is free software; you can redistribute it and/or\r
-// modify it under the terms of the GNU Lesser General Public\r
-// License as published by the Free Software Foundation; either\r
-// version 2.1 of the License, or (at your option) any later version.\r
-//\r
-// This library is distributed in the hope that it will be useful,\r
-// but WITHOUT ANY WARRANTY; without even the implied warranty of\r
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\r
-// Lesser General Public License for more details.\r
-//\r
-// You should have received a copy of the GNU Lesser General Public\r
-// License along with this library; if not, write to the Free Software\r
-// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA\r
-//\r
-// Contact: phylosoft @ gmail . com\r
-// WWW: https://sites.google.com/site/cmzmasek/home/software/forester\r
-\r
-package org.forester.util;\r
-\r
-import java.util.regex.Matcher;\r
-import java.util.regex.Pattern;\r
-\r
-import org.forester.phylogeny.data.Identifier;\r
-\r
-public final class SequenceIdParser {\r
-\r
- // gb_ADF31344_1_segmented_worms_\r
- // gb_AAA96518_1\r
- // gb_EHB07727_1_rodents_\r
- // dbj_BAF37827_1_turtles_\r
- // emb_CAA73223_1_primates_\r
- // lcl_91970_unknown_\r
- // mites|ref_XP_002434188_1\r
- // ref_XP_002434188_1_mites___ticks_\r
- // ref_NP_001121530_1_frogs___toads_\r
- //The format for GenBank Accession numbers are:\r
- //Nucleotide: 1 letter + 5 numerals OR 2 letters + 6 numerals\r
- //Protein: 3 letters + 5 numerals\r
- //http://www.ncbi.nlm.nih.gov/Sequin/acc.html\r
- private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern\r
- .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
- private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern\r
- .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
- private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern\r
- .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
- // RefSeq accession numbers can be distinguished from GenBank accessions \r
- // by their distinct prefix format of 2 characters followed by an\r
- // underscore character ('_'). For example, a RefSeq protein accession is NP_015325. \r
- private final static Pattern REFSEQ_PATTERN = Pattern\r
- .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );\r
- // See: http://web.expasy.org/docs/userman.html#ID_line\r
- private final static Pattern TREMBL_PATTERN = Pattern\r
- .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z][0-9][A-Z0-9]{3}[0-9])(?:[^a-zA-Z0-9]|\\Z)" );\r
- private final static Pattern GI_PATTERN = Pattern\r
- .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" );\r
-\r
- /**\r
- * Returns null if no match.\r
- * \r
- */\r
- public final static Identifier parse( final String s ) {\r
- if ( !ForesterUtil.isEmpty( s ) ) {\r
- String v = parseGenbankAccessor( s );\r
- if ( !ForesterUtil.isEmpty( v ) ) {\r
- return new Identifier( v, Identifier.NCBI );\r
- }\r
- v = parseRefSeqAccessor( s );\r
- if ( !ForesterUtil.isEmpty( v ) ) {\r
- return new Identifier( v, Identifier.REFSEQ );\r
- }\r
- v = parseTrEMBLAccessor( s );\r
- if ( !ForesterUtil.isEmpty( v ) ) {\r
- return new Identifier( v, Identifier.SP );\r
- }\r
- }\r
- return null;\r
- }\r
-\r
- public final static boolean isProtein( final String query ) {\r
- final String r1 = parseRefSeqAccessor( query );\r
- if ( !ForesterUtil.isEmpty( r1 ) && ( r1.charAt( 1 ) == 'P' ) ) {\r
- return true;\r
- }\r
- final String r2 = parseTrEMBLAccessor( query );\r
- if ( !ForesterUtil.isEmpty( r2 ) ) {\r
- return true;\r
- }\r
- return GENBANK_PROTEIN_AC_PATTERN.matcher( query ).lookingAt();\r
- }\r
-\r
- /**\r
- * Returns null if no match.\r
- * \r
- */\r
- public static String parseGenbankAccessor( final String query ) {\r
- Matcher m = GENBANK_NUCLEOTIDE_AC_PATTERN_1.matcher( query );\r
- if ( m.lookingAt() ) {\r
- return m.group( 1 );\r
- }\r
- else {\r
- m = GENBANK_NUCLEOTIDE_AC_PATTERN_2.matcher( query );\r
- if ( m.lookingAt() ) {\r
- return m.group( 1 );\r
- }\r
- else {\r
- m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );\r
- if ( m.lookingAt() ) {\r
- return m.group( 1 );\r
- }\r
- else {\r
- return null;\r
- }\r
- }\r
- }\r
- }\r
-\r
- public static String parseGenbankProteinAccessor( final String query ) {\r
- final Matcher m = GENBANK_PROTEIN_AC_PATTERN.matcher( query );\r
- if ( m.lookingAt() ) {\r
- return m.group( 1 );\r
- }\r
- else {\r
- return null;\r
- }\r
- }\r
-\r
- /**\r
- * Returns null if no match.\r
- * \r
- */\r
- public final static String parseRefSeqAccessor( final String query ) {\r
- final Matcher m = REFSEQ_PATTERN.matcher( query );\r
- if ( m.lookingAt() ) {\r
- return m.group( 1 );\r
- }\r
- return null;\r
- }\r
-\r
- /**\r
- * Returns null if no match.\r
- * \r
- */\r
- private final static String parseTrEMBLAccessor( final String query ) {\r
- final Matcher m = TREMBL_PATTERN.matcher( query );\r
- if ( m.lookingAt() ) {\r
- return m.group( 1 );\r
- }\r
- return null;\r
- }\r
-\r
- private SequenceIdParser() {\r
- // Hiding the constructor.\r
- }\r
-\r
- public static String parseGInumber( final String query ) {\r
- final Matcher m = GI_PATTERN.matcher( query );\r
- if ( m.find() ) {\r
- return m.group( 1 );\r
- }\r
- return null;\r
- }\r
-}\r
import org.forester.phylogeny.data.Taxonomy;
import org.forester.phylogeny.iterators.PhylogenyNodeIterator;
import org.forester.util.ForesterUtil;
-import org.forester.util.SequenceIdParser;
+import org.forester.util.SequenceAccessionTools;
public final class SequenceDbWsTools {
return null;
}
- public static SequenceDatabaseEntry obtainEmblEntry( final Identifier id, final int max_lines_to_return )
+ public static SequenceDatabaseEntry obtainEmblEntry( final Accession id, final int max_lines_to_return )
throws IOException {
final List<String> lines = queryEmblDb( id, max_lines_to_return );
return EbiDbEntry.createInstanceFromPlainText( lines );
}
- public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Identifier id, final int max_lines_to_return )
+ public static SequenceDatabaseEntry obtainRefSeqEntryFromEmbl( final Accession id, final int max_lines_to_return )
throws IOException {
final List<String> lines = queryEmblDb( id, max_lines_to_return );
return EbiDbEntry.createInstanceFromPlainTextForRefSeq( lines );
if ( ext_nodes_only && node.isInternal() ) {
continue;
}
- String query = null;
- Identifier id = null;
- Db db = Db.NONE;
- if ( node.getNodeData().isHasSequence()
- && ( node.getNodeData().getSequence().getAccession() != null )
- && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
- && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
- && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase()
- .startsWith( "uniprot" )
- || node.getNodeData().getSequence().getAccession().getValue().toLowerCase()
- .startsWith( "swissprot" )
- || node.getNodeData().getSequence().getAccession().getValue().toLowerCase()
- .startsWith( "trembl" )
- || node.getNodeData().getSequence().getAccession().getValue().toLowerCase()
- .startsWith( "sp" ) || node.getNodeData().getSequence().getAccession().getValue()
- .toLowerCase().startsWith( "uniprotkb" ) ) ) {
- query = node.getNodeData().getSequence().getAccession().getValue();
- db = Db.UNIPROT;
+ // String query = null;
+ // Accession id = null;
+ // Accession acc = SequenceAccessionTools.obtain( node );
+ //
+ //
+ // Db db = Db.NONE;
+ // if ( node.getNodeData().isHasSequence()
+ // && ( node.getNodeData().getSequence().getAccession() != null )
+ // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
+ // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
+ // && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase()
+ // .startsWith( "uniprot" )
+ // || node.getNodeData().getSequence().getAccession().getValue()
+ // .equalsIgnoreCase( "swissprot" )
+ // || node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "trembl" ) || node
+ // .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "sp" ) ) ) {
+ // query = node.getNodeData().getSequence().getAccession().getValue();
+ // db = Db.UNIPROT;
+ // }
+ // else if ( node.getNodeData().isHasSequence()
+ // && ( node.getNodeData().getSequence().getAccession() != null )
+ // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
+ // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
+ // && ( node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "embl" ) || node
+ // .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "ebi" ) ) ) {
+ // query = node.getNodeData().getSequence().getAccession().getValue();
+ // db = Db.EMBL;
+ // }
+ // else if ( node.getNodeData().isHasSequence()
+ // && ( node.getNodeData().getSequence().getAccession() != null )
+ // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
+ // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
+ // && ( node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "ncbi" ) || node
+ // .getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "genbank" ) ) ) {
+ // query = node.getNodeData().getSequence().getAccession().getValue();
+ // // db = Db.NCBI;
+ // }
+ // else if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null )
+ // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
+ // && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
+ // && node.getNodeData().getSequence().getAccession().getValue().equalsIgnoreCase( "refseq" ) ) {
+ // query = node.getNodeData().getSequence().getAccession().getValue();
+ // db = Db.REFSEQ;
+ // }
+ // else {
+ Accession acc = SequenceAccessionTools.obtainFromSeqAccession( node );
+ // if ( ( query = SequenceAccessionTools.extractUniProtKbProteinSeqIdentifier( node ) ) != null ) {
+ // db = Db.UNIPROT;
+ // }
+ // else if ( node.getNodeData().isHasSequence() ) {
+ // if ( ( id = SequenceAccessionTools.parse( node.getName() ) ) != null ) {
+ // if ( id.getSource() == Accession.NCBI ) {
+ // // db = Db.NCBI;
+ // }
+ // else if ( id.getSource() == Accession.REFSEQ ) {
+ // db = Db.REFSEQ;
+ // }
+ // }
+ // else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getName() ) ) != null ) {
+ // if ( id.getSource() == Accession.NCBI ) {
+ // // = Db.NCBI;
+ // }
+ // else if ( id.getSource() == Accession.REFSEQ ) {
+ // db = Db.REFSEQ;
+ // }
+ // }
+ // else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getGeneName() ) ) != null ) {
+ // if ( id.getSource() == Accession.NCBI ) {
+ // // db = Db.NCBI;
+ // }
+ // else if ( id.getSource() == Accession.REFSEQ ) {
+ // db = Db.REFSEQ;
+ // }
+ // }
+ // else if ( ( id = SequenceAccessionTools.parse( node.getNodeData().getSequence().getSymbol() ) ) != null ) {
+ // if ( id.getSource() == Accession.NCBI ) {
+ // // db = Db.NCBI;
+ // }
+ // else if ( id.getSource() == Accession.REFSEQ ) {
+ // db = Db.REFSEQ;
+ // }
+ // }
+ // }
+ // }
+ if ( ( acc == null )
+ || ForesterUtil.isEmpty( acc.getSource() )
+ || ForesterUtil.isEmpty( acc.getValue() )
+ || ( ( acc.getSource() != Accession.UNIPROT ) && ( acc.getSource() != Accession.EMBL ) && ( acc
+ .getSource() != Accession.REFSEQ ) ) ) {
+ acc = SequenceAccessionTools.parse( node );
}
- else if ( node.getNodeData().isHasSequence()
- && ( node.getNodeData().getSequence().getAccession() != null )
- && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
- && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
- && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "embl" ) || node
- .getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "ebi" ) ) ) {
- query = node.getNodeData().getSequence().getAccession().getValue();
- db = Db.EMBL;
- }
- else if ( node.getNodeData().isHasSequence()
- && ( node.getNodeData().getSequence().getAccession() != null )
- && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
- && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
- && ( node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "ncbi" ) || node
- .getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "genbank" ) ) ) {
- query = node.getNodeData().getSequence().getAccession().getValue();
- // db = Db.NCBI;
- }
- else if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null )
- && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() )
- && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() )
- && node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "refseq" ) ) {
- query = node.getNodeData().getSequence().getAccession().getValue();
- db = Db.REFSEQ;
- }
- else {
- if ( ( query = ForesterUtil.extractUniProtKbProteinSeqIdentifier( node ) ) != null ) {
- db = Db.UNIPROT;
- }
- else if ( node.getNodeData().isHasSequence() ) {
- if ( ( id = SequenceIdParser.parse( node.getName() ) ) != null ) {
- if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) {
- // db = Db.NCBI;
- }
- else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) {
- db = Db.REFSEQ;
- }
- }
- else if ( ( id = SequenceIdParser.parse( node.getNodeData().getSequence().getName() ) ) != null ) {
- if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) {
- // = Db.NCBI;
- }
- else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) {
- db = Db.REFSEQ;
- }
- }
- else if ( ( id = SequenceIdParser.parse( node.getNodeData().getSequence().getGeneName() ) ) != null ) {
- if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) {
- // db = Db.NCBI;
- }
- else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) {
- db = Db.REFSEQ;
- }
- }
- else if ( ( id = SequenceIdParser.parse( node.getNodeData().getSequence().getSymbol() ) ) != null ) {
- if ( id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) {
- // db = Db.NCBI;
- }
- else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) {
- db = Db.REFSEQ;
- }
- }
- }
- }
- if ( db == Db.NONE ) {
+ if ( ( acc == null )
+ || ForesterUtil.isEmpty( acc.getSource() )
+ || ForesterUtil.isEmpty( acc.getValue() )
+ || ( ( acc.getSource() != Accession.UNIPROT ) && ( acc.getSource() != Accession.EMBL ) && ( acc
+ .getSource() != Accession.REFSEQ ) ) ) {
not_found.add( node.toString() );
}
- SequenceDatabaseEntry db_entry = null;
- if ( !ForesterUtil.isEmpty( query ) ) {
- if ( db == Db.UNIPROT ) {
+ else {
+ SequenceDatabaseEntry db_entry = null;
+ final String query = acc.getValue();
+ if ( acc.getSource() == Accession.UNIPROT ) {
if ( DEBUG ) {
System.out.println( "uniprot: " + query );
}
db_entry = obtainUniProtEntry( query, lines_to_return );
}
- else if ( db == Db.EMBL ) {
+ else if ( acc.getSource() == Accession.EMBL ) {
if ( DEBUG ) {
System.out.println( "embl: " + query );
}
- db_entry = obtainEmblEntry( new Identifier( query ), lines_to_return );
+ db_entry = obtainEmblEntry( new Accession( query ), lines_to_return );
}
- else if ( db == Db.REFSEQ ) {
+ else if ( acc.getSource() == Accession.REFSEQ ) {
if ( DEBUG ) {
System.out.println( "refseq: " + query );
}
- db_entry = obtainRefSeqEntryFromEmbl( new Identifier( query ), lines_to_return );
+ db_entry = obtainRefSeqEntryFromEmbl( new Accession( query ), lines_to_return );
}
- // else if ( db == Db.NCBI ) {
- // if ( DEBUG ) {
- // System.out.println( "ncbi: " + query );
- // }
- // db_entry = obtainNcbiEntry( new Identifier( query ), lines_to_return );
- // }
- }
- else if ( ( db == Db.REFSEQ ) && ( id != null ) ) {
- db_entry = obtainRefSeqEntryFromEmbl( id, lines_to_return );
- }
- //else if ( ( db == Db.NCBI ) && ( id != null ) ) {
- // db_entry = obtainNcbiEntry( id, lines_to_return );
- //}
- if ( ( db_entry != null ) && !db_entry.isEmpty() ) {
- final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence()
- : new Sequence();
- if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) {
- String type = null;
- if ( db == Db.EMBL ) {
- type = "embl";
+ if ( ( db_entry != null ) && !db_entry.isEmpty() ) {
+ final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence()
+ : new Sequence();
+ if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) {
+ seq.setAccession( new Accession( db_entry.getAccession(), acc.getSource() ) );
}
- else if ( db == Db.UNIPROT ) {
- type = "uniprot";
+ if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) {
+ seq.setName( db_entry.getSequenceName() );
}
- // else if ( db == Db.NCBI ) {
- // type = "ncbi";
- // }
- else if ( db == Db.REFSEQ ) {
- type = "refseq";
+ if ( !ForesterUtil.isEmpty( db_entry.getGeneName() ) ) {
+ seq.setGeneName( db_entry.getGeneName() );
}
- seq.setAccession( new Accession( db_entry.getAccession(), type ) );
- }
- if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) {
- seq.setName( db_entry.getSequenceName() );
- }
- if ( !ForesterUtil.isEmpty( db_entry.getGeneName() ) ) {
- seq.setGeneName( db_entry.getGeneName() );
- }
- if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) {
- try {
- seq.setSymbol( db_entry.getSequenceSymbol() );
+ if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) {
+ try {
+ seq.setSymbol( db_entry.getSequenceSymbol() );
+ }
+ catch ( final PhyloXmlDataFormatException e ) {
+ // Eat this exception.
+ }
}
- catch ( final PhyloXmlDataFormatException e ) {
- // Eat this exception.
+ if ( ( db_entry.getGoTerms() != null ) && !db_entry.getGoTerms().isEmpty() ) {
+ for( final GoTerm go : db_entry.getGoTerms() ) {
+ final Annotation ann = new Annotation( go.getGoId().getId() );
+ ann.setDesc( go.getName() );
+ seq.addAnnotation( ann );
+ }
}
- }
- if ( ( db_entry.getGoTerms() != null ) && !db_entry.getGoTerms().isEmpty() ) {
- for( final GoTerm go : db_entry.getGoTerms() ) {
- final Annotation ann = new Annotation( go.getGoId().getId() );
- ann.setDesc( go.getName() );
- seq.addAnnotation( ann );
+ if ( ( db_entry.getCrossReferences() != null ) && !db_entry.getCrossReferences().isEmpty() ) {
+ for( final Accession x : db_entry.getCrossReferences() ) {
+ seq.addCrossReference( x );
+ }
}
- }
- if ( ( db_entry.getCrossReferences() != null ) && !db_entry.getCrossReferences().isEmpty() ) {
- for( final Accession x : db_entry.getCrossReferences() ) {
- seq.addCrossReference( x );
+ final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy()
+ : new Taxonomy();
+ if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) {
+ tax.setScientificName( db_entry.getTaxonomyScientificName() );
+ }
+ if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) {
+ tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) );
}
+ node.getNodeData().setTaxonomy( tax );
+ node.getNodeData().setSequence( seq );
}
- final Taxonomy tax = node.getNodeData().isHasTaxonomy() ? node.getNodeData().getTaxonomy()
- : new Taxonomy();
- if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) {
- tax.setScientificName( db_entry.getTaxonomyScientificName() );
+ else {
+ not_found.add( node.getName() );
}
- if ( allow_to_set_taxonomic_data && !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) {
- tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) );
+ try {
+ Thread.sleep( 10 );// Sleep for 10 ms
+ }
+ catch ( final InterruptedException ie ) {
}
- node.getNodeData().setTaxonomy( tax );
- node.getNodeData().setSequence( seq );
- }
- else if ( db != Db.NONE ) {
- not_found.add( node.getName() );
- }
- try {
- Thread.sleep( 10 );// Sleep for 10 ms
- }
- catch ( final InterruptedException ie ) {
}
}
return not_found;
return result;
}
- public static List<String> queryEmblDb( final Identifier id, final int max_lines_to_return ) throws IOException {
+ public static List<String> queryEmblDb( final Accession id, final int max_lines_to_return ) throws IOException {
final StringBuilder url_sb = new StringBuilder();
url_sb.append( BASE_EMBL_DB_URL );
- if ( ForesterUtil.isEmpty( id.getProvider() ) || id.getProvider().equalsIgnoreCase( Identifier.NCBI ) ) {
+ if ( ForesterUtil.isEmpty( id.getSource() ) || ( id.getSource() == Accession.NCBI ) ) {
url_sb.append( SequenceDbWsTools.EMBL_DBS_EMBL );
url_sb.append( '/' );
}
- else if ( id.getProvider().equalsIgnoreCase( Identifier.REFSEQ ) ) {
+ else if ( id.getSource() == Accession.REFSEQ ) {
if ( id.getValue().toUpperCase().indexOf( 'P' ) == 1 ) {
url_sb.append( SequenceDbWsTools.EMBL_DBS_REFSEQ_P );
url_sb.append( '/' );
}
return taxonomies;
}
-
- public enum Db {
- UNIPROT, EMBL, NCBI, NONE, REFSEQ;
- }
}