}
}
}
+ if ( ForesterUtil.isEmpty( uri_str ) ) {
+ final String v = ForesterUtil.extractGInumber( node );
+ if ( !ForesterUtil.isEmpty( v ) ) {
+ try {
+ uri_str = ForesterUtil.NCBI_GI + URLEncoder.encode( v, ForesterConstants.UTF8 );
+ }
+ catch ( final UnsupportedEncodingException e ) {
+ showErrorMessage( tp, e.toString() );
+ e.printStackTrace();
+ }
+ }
+ }
return uri_str;
}
if ( ForesterUtil.isEmpty( v ) ) {
v = ForesterUtil.extractRefSeqAccessorAccessor( node );
}
+ if ( ForesterUtil.isEmpty( v ) ) {
+ v = ForesterUtil.extractGInumber( node );
+ }
return v;
}
if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_NUCCORE + "XM_002122186" ) ) {
return false;
}
- n.setName( "AAA34956" );
+ n.setName( "dgh_AAA34956_gdg" );
if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "AAA34956" ) ) {
return false;
}
- n.setName( "Q06891.1" );
- if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "Q06891" ) ) {
+ n.setName( "j40f4_Q06891.1_fndn2 fnr3" );
+ if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "Q06891.1" ) ) {
+ return false;
+ }
+ n.setName( "GI:394892" );
+ if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_GI + "394892" ) ) {
+ System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+ return false;
+ }
+ n.setName( "gi_394892" );
+ if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_GI + "394892" ) ) {
+ System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
+ return false;
+ }
+ n.setName( "gi6335_gi_394892_56635_Gi_43" );
+ if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_GI + "394892" ) ) {
+ System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) );
return false;
}
}
if ( !SequenceIdParser.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) {
return false;
}
- if ( !SequenceIdParser.parseGenbankAccessor( ".AY423861." ).equals( "AY423861" ) ) {
+ if ( !SequenceIdParser.parseGenbankAccessor( ".AY423861.2" ).equals( "AY423861.2" ) ) {
+ return false;
+ }
+ if ( !SequenceIdParser.parseGenbankAccessor( "345_.AY423861.24_345" ).equals( "AY423861.24" ) ) {
return false;
}
if ( SequenceIdParser.parseGenbankAccessor( "AAY423861" ) != null ) {
if ( !entry.getSequenceName().equals( "Aspartate aminotransferase, mitochondrial" ) ) {
return false;
}
- if ( !entry.getSequenceSymbol().equals( "GOT2" ) ) {
- return false;
- }
if ( !entry.getTaxonomyIdentifier().equals( "9986" ) ) {
return false;
}
.compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" );
public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern
.compile( "\\b(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)\\b" );
+ /** URL prefix for NCBI protein look-ups by GI number; the (URL-encoded) GI number is appended directly after the trailing "gi:". */
+ public static final String NCBI_GI = "http://www.ncbi.nlm.nih.gov/protein/gi:";
static {
final DecimalFormatSymbols dfs = new DecimalFormatSymbols();
dfs.setDecimalSeparator( '.' );
return v;
}
+ /**
+  * Tries to find a GI number for the given node.
+  * <p>
+  * Candidates are examined in this order, stopping at the first one that
+  * yields a GI number: the name of the node's sequence, the value of the
+  * sequence's accession, and finally the name of the node itself. Sequence
+  * data is only consulted if the node reports having a sequence.
+  *
+  * @param node the node to inspect
+  * @return the GI number as parsed by {@code SequenceIdParser.parseGInumber},
+  *         or null if none of the candidates contains one
+  */
+ public static String extractGInumber( final PhylogenyNode node ) {
+     String gi = null;
+     if ( node.getNodeData().isHasSequence() ) {
+         final Sequence sequence = node.getNodeData().getSequence();
+         // First choice: the sequence name.
+         final String sequence_name = sequence.getName();
+         if ( !isEmpty( sequence_name ) ) {
+             gi = SequenceIdParser.parseGInumber( sequence_name );
+         }
+         // Second choice: the accession value (the accession itself is optional).
+         if ( isEmpty( gi ) && ( sequence.getAccession() != null ) ) {
+             final String accession_value = sequence.getAccession().getValue();
+             if ( !isEmpty( accession_value ) ) {
+                 gi = SequenceIdParser.parseGInumber( accession_value );
+             }
+         }
+     }
+     // Last resort: the node name.
+     if ( isEmpty( gi ) ) {
+         final String node_name = node.getName();
+         if ( !isEmpty( node_name ) ) {
+             gi = SequenceIdParser.parseGInumber( node_name );
+         }
+     }
+     return gi;
+ }
+
public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) {
String upkb = null;
if ( node.getNodeData().isHasSequence() ) {
//Protein: 3 letters + 5 numerals\r
//http://www.ncbi.nlm.nih.gov/Sequin/acc.html\r
private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern\r
- .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );\r
+ .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern\r
- .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6})(?:[^a-zA-Z0-9]|\\Z)" );\r
+ .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern\r
- .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5})(?:[^a-zA-Z0-9]|\\Z)" );\r
+ .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" );\r
// RefSeq accession numbers can be distinguished from GenBank accessions \r
// by their distinct prefix format of 2 characters followed by an\r
// underscore character ('_'). For example, a RefSeq protein accession is NP_015325. \r
// See: http://web.expasy.org/docs/userman.html#ID_line\r
private final static Pattern TREMBL_PATTERN = Pattern\r
.compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z][0-9][A-Z0-9]{3}[0-9])(?:[^a-zA-Z0-9]|\\Z)" );\r
+ // GI number: "GI" or "gi" (preceded by a word boundary or '_'), then one of\r
+ // the separators '|', '_', '=' or ':', then the digits (capture group 1),\r
+ // followed by a word boundary or '_'. Examples: "GI:394892", "gi_394892".\r
+ private final static Pattern GI_PATTERN = Pattern\r
+ .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" );\r
\r
/**\r
* Returns null if no match.\r
private SequenceIdParser() {\r
// Private so that this class cannot be instantiated from outside.\r
}\r
+\r
+ /**\r
+  * Returns the first GI number found in the given string, as recognized\r
+  * by GI_PATTERN (for example "394892" for "GI:394892" or "gi_394892").\r
+  *\r
+  * @param query the string to search, may be null\r
+  * @return the digits of the GI number, or null if query is null or\r
+  *         contains no match\r
+  */\r
+ public static String parseGInumber( final String query ) {\r
+     // Pattern.matcher( null ) throws a NullPointerException; treat null as "no match".\r
+     if ( query == null ) {\r
+         return null;\r
+     }\r
+     final Matcher m = GI_PATTERN.matcher( query );\r
+     if ( m.find() ) {\r
+         return m.group( 1 );\r
+     }\r
+     return null;\r
+ }\r
}\r
db_entry = obtainRefSeqEntryFromEmbl( id, lines_to_return );
}
else if ( ( db == Db.NCBI ) && ( id != null ) ) {
- db_entry = obtainEmblEntry( id, lines_to_return );
+ db_entry = obtainEmblEntry( id, lines_to_return ); //TODO ?
}
if ( ( db_entry != null ) && !db_entry.isEmpty() ) {
final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence()