From aec065f948a075773794133f102ea19eb1d59f64 Mon Sep 17 00:00:00 2001 From: "cmzmasek@gmail.com" Date: Fri, 8 Mar 2013 00:04:11 +0000 Subject: [PATCH] inprogress --- .../src/org/forester/archaeopteryx/AptxUtil.java | 12 ++++++++ .../src/org/forester/archaeopteryx/TreePanel.java | 3 ++ forester/java/src/org/forester/test/Test.java | 29 +++++++++++++++----- .../java/src/org/forester/util/ForesterUtil.java | 19 +++++++++++++ .../src/org/forester/util/SequenceIdParser.java | 16 +++++++++-- .../org/forester/ws/seqdb/SequenceDbWsTools.java | 2 +- 6 files changed, 70 insertions(+), 11 deletions(-) diff --git a/forester/java/src/org/forester/archaeopteryx/AptxUtil.java b/forester/java/src/org/forester/archaeopteryx/AptxUtil.java index 8ccd028..bc7a693 100644 --- a/forester/java/src/org/forester/archaeopteryx/AptxUtil.java +++ b/forester/java/src/org/forester/archaeopteryx/AptxUtil.java @@ -150,6 +150,18 @@ public final class AptxUtil { } } } + if ( ForesterUtil.isEmpty( uri_str ) ) { + final String v = ForesterUtil.extractGInumber( node ); + if ( !ForesterUtil.isEmpty( v ) ) { + try { + uri_str = ForesterUtil.NCBI_GI + URLEncoder.encode( v, ForesterConstants.UTF8 ); + } + catch ( final UnsupportedEncodingException e ) { + showErrorMessage( tp, e.toString() ); + e.printStackTrace(); + } + } + } return uri_str; } diff --git a/forester/java/src/org/forester/archaeopteryx/TreePanel.java b/forester/java/src/org/forester/archaeopteryx/TreePanel.java index b2ec09f..533093c 100644 --- a/forester/java/src/org/forester/archaeopteryx/TreePanel.java +++ b/forester/java/src/org/forester/archaeopteryx/TreePanel.java @@ -3243,6 +3243,9 @@ public final class TreePanel extends JPanel implements ActionListener, MouseWhee if ( ForesterUtil.isEmpty( v ) ) { v = ForesterUtil.extractRefSeqAccessorAccessor( node ); } + if ( ForesterUtil.isEmpty( v ) ) { + v = ForesterUtil.extractGInumber( node ); + } return v; } diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 4a3e95b..93a623e 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -1041,12 +1041,27 @@ public final class Test { if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_NUCCORE + "XM_002122186" ) ) { return false; } - n.setName( "AAA34956" ); + n.setName( "dgh_AAA34956_gdg" ); if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "AAA34956" ) ) { return false; } - n.setName( "Q06891.1" ); - if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "Q06891" ) ) { + n.setName( "j40f4_Q06891.1_fndn2 fnr3" ); + if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_PROTEIN + "Q06891.1" ) ) { + return false; + } + n.setName( "GI:394892" ); + if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_GI + "394892" ) ) { + System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) ); + return false; + } + n.setName( "gi_394892" ); + if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_GI + "394892" ) ) { + System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) ); + return false; + } + n.setName( "gi6335_gi_394892_56635_Gi_43" ); + if ( !AptxUtil.createUriForSeqWeb( n, null, null ).equals( ForesterUtil.NCBI_GI + "394892" ) ) { + System.out.println( AptxUtil.createUriForSeqWeb( n, null, null ) ); return false; } } @@ -9376,7 +9391,10 @@ public final class Test { if ( !SequenceIdParser.parseGenbankAccessor( "AY423861" ).equals( "AY423861" ) ) { return false; } - if ( !SequenceIdParser.parseGenbankAccessor( ".AY423861." ).equals( "AY423861" ) ) { + if ( !SequenceIdParser.parseGenbankAccessor( ".AY423861.2" ).equals( "AY423861.2" ) ) { + return false; + } + if ( !SequenceIdParser.parseGenbankAccessor( "345_.AY423861.24_345" ).equals( "AY423861.24" ) ) { return false; } if ( SequenceIdParser.parseGenbankAccessor( "AAY423861" ) != null ) { @@ -9427,9 +9445,6 @@ public final class Test { if ( !entry.getSequenceName().equals( "Aspartate aminotransferase, mitochondrial" ) ) { return false; } - if ( !entry.getSequenceSymbol().equals( "GOT2" ) ) { - return false; - } if ( !entry.getTaxonomyIdentifier().equals( "9986" ) ) { return false; } diff --git a/forester/java/src/org/forester/util/ForesterUtil.java b/forester/java/src/org/forester/util/ForesterUtil.java index 828d3df..a8c81b0 100644 --- a/forester/java/src/org/forester/util/ForesterUtil.java +++ b/forester/java/src/org/forester/util/ForesterUtil.java @@ -90,6 +90,7 @@ public final class ForesterUtil { .compile( "(?:\\b|_)(?:sp|tr)[\\.|\\-_=/\\\\]([A-Z][0-9][A-Z0-9]{3}[0-9])(?:\\b|_)" ); public final static Pattern UNIPROT_KB_PATTERN_2 = Pattern .compile( "\\b(?:[A-Z0-9]{2,5}|(?:[A-Z][0-9][A-Z0-9]{3}[0-9]))_(([A-Z9][A-Z]{2}[A-Z0-9]{2})|RAT|PIG|PEA)\\b" ); + public static final String NCBI_GI = "http://www.ncbi.nlm.nih.gov/protein/gi:"; static { final DecimalFormatSymbols dfs = new DecimalFormatSymbols(); dfs.setDecimalSeparator( '.' ); @@ -145,6 +146,24 @@ public final class ForesterUtil { return v; } + public static String extractGInumber( final PhylogenyNode node ) { + String v = null; + if ( node.getNodeData().isHasSequence() ) { + final Sequence seq = node.getNodeData().getSequence(); + if ( isEmpty( v ) && !isEmpty( seq.getName() ) ) { + v = SequenceIdParser.parseGInumber( seq.getName() ); + } + if ( isEmpty( v ) && ( node.getNodeData().getSequence().getAccession() != null ) + && !isEmpty( seq.getAccession().getValue() ) ) { + v = SequenceIdParser.parseGInumber( seq.getAccession().getValue() ); + } + } + if ( isEmpty( v ) && !isEmpty( node.getName() ) ) { + v = SequenceIdParser.parseGInumber( node.getName() ); + } + return v; + } + public static String extractUniProtKbProteinSeqIdentifier( final PhylogenyNode node ) { String upkb = null; if ( node.getNodeData().isHasSequence() ) { diff --git a/forester/java/src/org/forester/util/SequenceIdParser.java b/forester/java/src/org/forester/util/SequenceIdParser.java index 6d2dd37..d828a6a 100644 --- a/forester/java/src/org/forester/util/SequenceIdParser.java +++ b/forester/java/src/org/forester/util/SequenceIdParser.java @@ -49,11 +49,11 @@ public final class SequenceIdParser { //Protein: 3 letters + 5 numerals //http://www.ncbi.nlm.nih.gov/Sequin/acc.html private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_1 = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5})(?:[^a-zA-Z0-9]|\\Z)" ); + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); private final static Pattern GENBANK_NUCLEOTIDE_AC_PATTERN_2 = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6})(?:[^a-zA-Z0-9]|\\Z)" ); + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}\\d{6}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); private final static Pattern GENBANK_PROTEIN_AC_PATTERN = Pattern - .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5})(?:[^a-zA-Z0-9]|\\Z)" ); + .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{3}\\d{5}(?:\\.\\d+)?)(?:[^a-zA-Z0-9]|\\Z)" ); // RefSeq accession numbers can be distinguished from GenBank accessions // by their distinct prefix format of 2 characters followed by an // underscore character ('_'). For example, a RefSeq protein accession is NP_015325. @@ -62,6 +62,8 @@ public final class SequenceIdParser { // See: http://web.expasy.org/docs/userman.html#ID_line private final static Pattern TREMBL_PATTERN = Pattern .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z][0-9][A-Z0-9]{3}[0-9])(?:[^a-zA-Z0-9]|\\Z)" ); + private final static Pattern GI_PATTERN = Pattern + .compile( "(?:\\b|_)(?:GI|gi)[|_=:](\\d+)(?:\\b|_)" ); /** * Returns null if no match. @@ -148,4 +150,12 @@ public final class SequenceIdParser { private SequenceIdParser() { // Hiding the constructor. } + + public static String parseGInumber( final String query ) { + final Matcher m = GI_PATTERN.matcher( query ); + if ( m.find() ) { + return m.group( 1 ); + } + return null; + } } diff --git a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java index e3b222b..7c656d5 100644 --- a/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java +++ b/forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java @@ -230,7 +230,7 @@ public final class SequenceDbWsTools { db_entry = obtainRefSeqEntryFromEmbl( id, lines_to_return ); } else if ( ( db == Db.NCBI ) && ( id != null ) ) { - db_entry = obtainEmblEntry( id, lines_to_return ); + db_entry = obtainEmblEntry( id, lines_to_return ); //TODO ? } if ( ( db_entry != null ) && !db_entry.isEmpty() ) { final Sequence seq = node.getNodeData().isHasSequence() ? node.getNodeData().getSequence() -- 1.7.10.2