inprogress
author cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Fri, 4 Oct 2013 22:40:39 +0000 (22:40 +0000)
committer cmzmasek@gmail.com <cmzmasek@gmail.com@ca865154-3058-d1c3-3e42-d8f55a55bdbd>
Fri, 4 Oct 2013 22:40:39 +0000 (22:40 +0000)
forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java
forester/java/src/org/forester/phylogeny/PhylogenyNode.java
forester/java/src/org/forester/test/Test.java
forester/java/src/org/forester/ws/seqdb/DatabaseTools.java [deleted file]
forester/java/src/org/forester/ws/seqdb/EbiDbEntry.java
forester/java/src/org/forester/ws/seqdb/SequenceDbWsTools.java
forester/java/src/org/forester/ws/seqdb/UniProtEntry.java

index cc6a444..fcb71fc 100644 (file)
@@ -38,11 +38,10 @@ import org.forester.ws.seqdb.SequenceDbWsTools;
 
 public final class SequenceDataRetriver extends RunnableProcess {
 
-    private final static int           DEFAULT_LINES_TO_RETURN = 4000;
     private final Phylogeny            _phy;
     private final MainFrameApplication _mf;
     private final TreePanel            _treepanel;
-    public final static boolean        DEBUG                   = false;
+    public final static boolean        DEBUG = false;
 
     public SequenceDataRetriver( final MainFrameApplication mf, final TreePanel treepanel, final Phylogeny phy ) {
         _phy = phy;
@@ -59,7 +58,10 @@ public final class SequenceDataRetriver extends RunnableProcess {
         start( _mf, "sequence data" );
         SortedSet<String> not_found = null;
         try {
-            not_found = SequenceDbWsTools.obtainSeqInformation( _phy, false, true, DEFAULT_LINES_TO_RETURN );
+            not_found = SequenceDbWsTools.obtainSeqInformation( _phy,
+                                                                false,
+                                                                true,
+                                                                SequenceDbWsTools.DEFAULT_LINES_TO_RETURN );
         }
         catch ( final UnknownHostException e ) {
             JOptionPane.showMessageDialog( _mf,
index 2b7c2cf..6b180ca 100644 (file)
@@ -74,6 +74,15 @@ public final class PhylogenyNode implements Comparable<PhylogenyNode> {
         setSumExtNodes( 1 ); // For ext node, this number is 1 (not 0!!)
     }
 
+    public PhylogenyNode( final String node_name ) {
+        setId( PhylogenyNode.getNodeCount() );
+        PhylogenyNode.increaseNodeCount();
+        setSumExtNodes( 1 ); // For ext node, this number is 1 (not 0!!)
+        if ( node_name != null ) {
+            getNodeData().setNodeName( node_name );
+        }
+    }
+
     private PhylogenyNode( final String nhx,
                            final NHXParser.TAXONOMY_EXTRACTION taxonomy_extraction,
                            final boolean replace_underscores ) throws NHXFormatException, PhyloXmlDataFormatException {
index 410dd17..64b7807 100644 (file)
@@ -37,7 +37,6 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Set;
 import java.util.SortedSet;
-import java.util.TreeSet;
 
 import org.forester.application.support_transfer;
 import org.forester.archaeopteryx.TreePanelUtil;
@@ -128,6 +127,7 @@ import org.forester.ws.wabi.TxSearch.TAX_RANK;
 @SuppressWarnings( "unused")
 public final class Test {
 
+    private final static boolean PERFORM_DB_TESTS          = true;
     private final static double  ZERO_DIFF                 = 1.0E-9;
     private final static String  PATH_TO_TEST_DATA         = System.getProperty( "user.dir" )
                                                                    + ForesterUtil.getFileSeparator() + "test_data"
@@ -255,15 +255,17 @@ public final class Test {
             System.out.println( "failed." );
             failed++;
         }
-        System.out.print( "Sequence DB tools 2: " );
-        if ( testSequenceDbWsTools2() ) {
-            System.out.println( "OK." );
-            succeeded++;
-        }
-        else {
-            System.out.println( "failed." );
-            failed++;
-            System.exit( -1 );
+        if ( PERFORM_DB_TESTS ) {
+            System.out.print( "Sequence DB tools 2: " );
+            if ( testSequenceDbWsTools2() ) {
+                System.out.println( "OK." );
+                succeeded++;
+            }
+            else {
+                System.out.println( "failed." );
+                failed++;
+                System.exit( -1 );
+            }
         }
         System.exit( 0 );
         System.out.print( "Hmmscan output parser: " );
@@ -836,23 +838,27 @@ public final class Test {
             System.out.println( "failed." );
             failed++;
         }
-        System.out.print( "Uniprot Entry Retrieval: " );
-        if ( Test.testUniprotEntryRetrieval() ) {
-            System.out.println( "OK." );
-            succeeded++;
-        }
-        else {
-            System.out.println( "failed." );
-            failed++;
-        }
-        System.out.print( "Uniprot Taxonomy Search: " );
-        if ( Test.testUniprotTaxonomySearch() ) {
-            System.out.println( "OK." );
-            succeeded++;
+        if ( PERFORM_DB_TESTS ) {
+            System.out.print( "Uniprot Entry Retrieval: " );
+            if ( Test.testUniprotEntryRetrieval() ) {
+                System.out.println( "OK." );
+                succeeded++;
+            }
+            else {
+                System.out.println( "failed." );
+                failed++;
+            }
         }
-        else {
-            System.out.println( "failed." );
-            failed++;
+        if ( PERFORM_DB_TESTS ) {
+            System.out.print( "Uniprot Taxonomy Search: " );
+            if ( Test.testUniprotTaxonomySearch() ) {
+                System.out.println( "OK." );
+                succeeded++;
+            }
+            else {
+                System.out.println( "failed." );
+                failed++;
+            }
         }
         //----
         String path = "";
@@ -10861,104 +10867,105 @@ public final class Test {
 
     private static boolean testSequenceDbWsTools1() {
         try {
-            PhylogenyNode n = new PhylogenyNode();
+            final PhylogenyNode n = new PhylogenyNode();
             n.setName( "NP_001025424" );
             Accession acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.REFSEQ.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.REFSEQ.toString() )
                     || !acc.getValue().equals( "NP_001025424" ) ) {
                 return false;
             }
             n.setName( "340 0559 -- _NP_001025424_dsfdg15 05" );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.REFSEQ.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.REFSEQ.toString() )
                     || !acc.getValue().equals( "NP_001025424" ) ) {
                 return false;
             }
             n.setName( "NP_001025424.1" );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.REFSEQ.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.REFSEQ.toString() )
                     || !acc.getValue().equals( "NP_001025424" ) ) {
                 return false;
             }
             n.setName( "NM_001030253" );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.REFSEQ.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.REFSEQ.toString() )
                     || !acc.getValue().equals( "NM_001030253" ) ) {
                 return false;
             }
             n.setName( "BCL2_HUMAN" );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.UNIPROT.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.UNIPROT.toString() )
                     || !acc.getValue().equals( "BCL2_HUMAN" ) ) {
                 System.out.println( acc.toString() );
                 return false;
             }
             n.setName( "P10415" );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.UNIPROT.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.UNIPROT.toString() )
                     || !acc.getValue().equals( "P10415" ) ) {
                 System.out.println( acc.toString() );
                 return false;
             }
             n.setName( " P10415 " );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.UNIPROT.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.UNIPROT.toString() )
                     || !acc.getValue().equals( "P10415" ) ) {
                 System.out.println( acc.toString() );
                 return false;
             }
             n.setName( "_P10415|" );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.UNIPROT.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.UNIPROT.toString() )
                     || !acc.getValue().equals( "P10415" ) ) {
                 System.out.println( acc.toString() );
                 return false;
             }
             n.setName( "AY695820" );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.NCBI.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.NCBI.toString() )
                     || !acc.getValue().equals( "AY695820" ) ) {
                 System.out.println( acc.toString() );
                 return false;
             }
             n.setName( "_AY695820_" );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.NCBI.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.NCBI.toString() )
                     || !acc.getValue().equals( "AY695820" ) ) {
                 System.out.println( acc.toString() );
                 return false;
             }
             n.setName( "AAA59452" );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.NCBI.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.NCBI.toString() )
                     || !acc.getValue().equals( "AAA59452" ) ) {
                 System.out.println( acc.toString() );
                 return false;
             }
             n.setName( "_AAA59452_" );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.NCBI.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.NCBI.toString() )
                     || !acc.getValue().equals( "AAA59452" ) ) {
                 System.out.println( acc.toString() );
                 return false;
             }
             n.setName( "AAA59452.1" );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.NCBI.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.NCBI.toString() )
                     || !acc.getValue().equals( "AAA59452.1" ) ) {
                 System.out.println( acc.toString() );
                 return false;
             }
             n.setName( "_AAA59452.1_" );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.NCBI.toString() )
+            if ( ( acc == null ) || !acc.getSource().equals( Source.NCBI.toString() )
                     || !acc.getValue().equals( "AAA59452.1" ) ) {
                 System.out.println( acc.toString() );
                 return false;
             }
             n.setName( "GI:94894583" );
             acc = SequenceDbWsTools.obtainSeqAccession( n );
-            if ( acc == null || !acc.getSource().equals( Source.GI.toString() ) || !acc.getValue().equals( "94894583" ) ) {
+            if ( ( acc == null ) || !acc.getSource().equals( Source.GI.toString() )
+                    || !acc.getValue().equals( "94894583" ) ) {
                 System.out.println( acc.toString() );
                 return false;
             }
@@ -10971,18 +10978,22 @@ public final class Test {
 
     private static boolean testSequenceDbWsTools2() {
         try {
-            PhylogenyNode n1 = new PhylogenyNode();
-            n1.setName( "NP_001025424" );
-            SequenceDbWsTools.obtainSeqInformation( false, 4000, new TreeSet<String>(), n1 );
+            final PhylogenyNode n1 = new PhylogenyNode( "NP_001025424" );
+            SequenceDbWsTools.obtainSeqInformation( n1 );
             if ( !n1.getNodeData().getSequence().getName().equals( "Bcl2" ) ) {
                 return false;
             }
             if ( !n1.getNodeData().getTaxonomy().getScientificName().equals( "Danio rerio" ) ) {
                 return false;
             }
-            PhylogenyNode n2 = new PhylogenyNode();
-            n2.setName( "NM_001030253" );
-            SequenceDbWsTools.obtainSeqInformation( false, 4000, new TreeSet<String>(), n2 );
+            if ( !n1.getNodeData().getSequence().getAccession().getSource().equals( Source.REFSEQ.toString() ) ) {
+                return false;
+            }
+            if ( !n1.getNodeData().getSequence().getAccession().getValue().equals( "NP_001025424" ) ) {
+                return false;
+            }
+            final PhylogenyNode n2 = new PhylogenyNode( "NM_001030253" );
+            SequenceDbWsTools.obtainSeqInformation( n2 );
             System.out.println( n2.toString() );
             if ( !n2.getNodeData().getSequence().getName()
                     .equals( "Danio rerio B-cell leukemia/lymphoma 2 (bcl2), mRNA" ) ) {
@@ -10991,6 +11002,28 @@ public final class Test {
             if ( !n2.getNodeData().getTaxonomy().getScientificName().equals( "Danio rerio" ) ) {
                 return false;
             }
+            if ( !n2.getNodeData().getSequence().getAccession().getSource().equals( Source.REFSEQ.toString() ) ) {
+                return false;
+            }
+            if ( !n2.getNodeData().getSequence().getAccession().getValue().equals( "NM_001030253" ) ) {
+                return false;
+            }
+            final PhylogenyNode n3 = new PhylogenyNode( "NM_184234.2" );
+            SequenceDbWsTools.obtainSeqInformation( n3 );
+            System.out.println( "n=" + n3.toString() );
+            if ( !n3.getNodeData().getSequence().getName()
+                    .equals( "Homo sapiens RNA binding motif protein 39 (RBM39), transcript variant 1, mRNA" ) ) {
+                return false;
+            }
+            if ( !n3.getNodeData().getTaxonomy().getScientificName().equals( "Homo sapiens" ) ) {
+                return false;
+            }
+            if ( !n3.getNodeData().getSequence().getAccession().getSource().equals( Source.REFSEQ.toString() ) ) {
+                return false;
+            }
+            if ( !n3.getNodeData().getSequence().getAccession().getValue().equals( "NM_184234" ) ) {
+                return false;
+            }
         }
         catch ( final IOException e ) {
             System.out.println();
@@ -10999,6 +11032,7 @@ public final class Test {
             return true;
         }
         catch ( final Exception e ) {
+            e.printStackTrace();
             return false;
         }
         return true;
diff --git a/forester/java/src/org/forester/ws/seqdb/DatabaseTools.java b/forester/java/src/org/forester/ws/seqdb/DatabaseTools.java
deleted file mode 100644 (file)
index c8e1a4b..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-
-package org.forester.ws.seqdb;
-
-public class DatabaseTools {
-
-    static String extract( final String target, final String a, final String b ) {
-        final int i_a = target.indexOf( a );
-        final int i_b = target.indexOf( b );
-        if ( ( i_a < 0 ) || ( i_b < i_a ) ) {
-            throw new IllegalArgumentException( "attempt to extract from [" + target + "] between [" + a + "] and ["
-                    + b + "]" );
-        }
-        return target.substring( i_a + a.length(), i_b ).trim();
-    }
-
-    static String extract( final String target, final String a ) {
-        final int i_a = target.indexOf( a );
-        return target.substring( i_a + a.length() ).trim();
-    }
-}
index 65ae847..b1545b3 100644 (file)
@@ -33,6 +33,7 @@ import org.forester.util.ForesterUtil;
 
 public final class EbiDbEntry implements SequenceDatabaseEntry {
 
+    // FIXME: actually, this is an NCBI entry
     //http://www.ebi.ac.uk/Tools/dbfetch/dbfetch/emb/AAR37336/
     private String _pa;
     private String _de;
@@ -41,6 +42,454 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
     private String _symbol;
     private String _provider;
 
+    // TODO PUBMED 15798186
+    // TODO (FEATURES)
+    // source /db_xref="taxon:9606"
+    // gene            1..2881  
+    // /gene="RBM39" 
+    //
+    // /db_xref="MIM:604739"  
+    // CDS
+    // /gene="RBM39"
+    // /db_xref="MIM:604739"
+    // /db_xref="InterPro:IPR002475"
+    // /product="Bcl-2"
+    // /protein_id="NP_909122.1"
+    // /db_xref="UniProtKB/TrEMBL:Q5J7V1" <- reparse?
+    //
+    // Protein
+    /*
+    LOCUS       NM_184234               2881 bp    mRNA    linear   PRI 16-JUN-2013
+    DEFINITION  Homo sapiens RNA binding motif protein 39 (RBM39), transcript
+            variant 1, mRNA.
+    ACCESSION   NM_184234
+    VERSION     NM_184234.2  GI:336176061
+    KEYWORDS    RefSeq.
+    SOURCE      Homo sapiens (human)
+    ORGANISM  Homo sapiens
+            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
+            Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
+            Catarrhini; Hominidae; Homo.
+    REFERENCE   1  (bases 1 to 2881)
+    AUTHORS   Sillars-Hardebol,A.H., Carvalho,B., Belien,J.A., de Wit,M.,
+            Delis-van Diemen,P.M., Tijssen,M., van de Wiel,M.A., Ponten,F.,
+            Meijer,G.A. and Fijneman,R.J.
+    TITLE     CSE1L, DIDO1 and RBM39 in colorectal adenoma to carcinoma
+            progression
+    JOURNAL   Cell Oncol (Dordr) 35 (4), 293-300 (2012)
+    PUBMED   22711543
+    REMARK    GeneRIF: Data show that CSE1L, DIDO1 and RBM39 mRNA expression
+            levels correlated with chromosome 20q DNA copy number status.
+    REFERENCE   2  (bases 1 to 2881)
+    AUTHORS   Huang,G., Zhou,Z., Wang,H. and Kleinerman,E.S.
+    TITLE     CAPER-alpha alternative splicing regulates the expression of
+            vascular endothelial growth factor(1)(6)(5) in Ewing sarcoma cells
+    JOURNAL   Cancer 118 (8), 2106-2116 (2012)
+    PUBMED   22009261
+    REMARK    GeneRIF: Increased VEGF(165) expression is secondary to the
+            down-regulation of CAPER-alpha by EWS/FLI-1. CAPER-alpha mediates
+            alternative splicing and controls the shift from VEGF(189) to
+            VEGF(165) .
+    REFERENCE   3  (bases 1 to 2881)
+    AUTHORS   Han,B., Stockwin,L.H., Hancock,C., Yu,S.X., Hollingshead,M.G. and
+            Newton,D.L.
+    TITLE     Proteomic analysis of nuclei isolated from cancer cell lines
+            treated with indenoisoquinoline NSC 724998, a novel topoisomerase I
+            inhibitor
+    JOURNAL   J. Proteome Res. 9 (8), 4016-4027 (2010)
+    PUBMED   20515076
+    REMARK    Erratum:[J Proteome Res. 2011 Apr 1;10(4):2128]
+    REFERENCE   4  (bases 1 to 2881)
+    AUTHORS   Zhang,J.Y., Looi,K.S. and Tan,E.M.
+    TITLE     Identification of tumor-associated antigens as diagnostic and
+            predictive biomarkers in cancer
+    JOURNAL   Methods Mol. Biol. 520, 1-10 (2009)
+    PUBMED   19381943
+    REFERENCE   5  (bases 1 to 2881)
+    AUTHORS   Dutta,J., Fan,G. and Gelinas,C.
+    TITLE     CAPERalpha is a novel Rel-TAD-interacting factor that inhibits
+            lymphocyte transformation by the potent Rel/NF-kappaB oncoprotein
+            v-Rel
+    JOURNAL   J. Virol. 82 (21), 10792-10802 (2008)
+    PUBMED   18753212
+    REMARK    GeneRIF: this study identifies CAPERalpha (RNA binding motif
+            protein 39) as a new transcriptional coregulator for v-Rel and
+            reveals an important role in modulating Rel's oncogenic activity.
+    REFERENCE   6  (bases 1 to 2881)
+    AUTHORS   Cazalla,D., Newton,K. and Caceres,J.F.
+    TITLE     A novel SR-related protein is required for the second step of
+            Pre-mRNA splicing
+    JOURNAL   Mol. Cell. Biol. 25 (8), 2969-2980 (2005)
+    PUBMED   15798186
+    REFERENCE   7  (bases 1 to 2881)
+    AUTHORS   Dowhan,D.H., Hong,E.P., Auboeuf,D., Dennis,A.P., Wilson,M.M.,
+            Berget,S.M. and O'Malley,B.W.
+    TITLE     Steroid hormone receptor coactivation and alternative RNA splicing
+            by U2AF65-related proteins CAPERalpha and CAPERbeta
+    JOURNAL   Mol. Cell 17 (3), 429-439 (2005)
+    PUBMED   15694343
+    REFERENCE   8  (bases 1 to 2881)
+    AUTHORS   Sun,N.N., Fastje,C.D., Wong,S.S., Sheppard,P.R., Macdonald,S.J.,
+            Ridenour,G., Hyde,J.D. and Witten,M.L.
+    TITLE     Dose-dependent transcriptome changes by metal ores on a human acute
+            lymphoblastic leukemia cell line
+    JOURNAL   Toxicol Ind Health 19 (7-10), 157-163 (2003)
+    PUBMED   15747776
+    REMARK    GeneRIF: 10 genes were down-regulated following treatment of the
+            T-ALL cells with 0.15 and 1.5 microg/mL of metal ores at 72 h
+    REFERENCE   9  (bases 1 to 2881)
+    AUTHORS   Jung,D.J., Na,S.Y., Na,D.S. and Lee,J.W.
+    TITLE     Molecular cloning and characterization of CAPER, a novel
+            coactivator of activating protein-1 and estrogen receptors
+    JOURNAL   J. Biol. Chem. 277 (2), 1229-1234 (2002)
+    PUBMED   11704680
+    REMARK    GeneRIF: This paper describes the mouse gene.
+    REFERENCE   10 (bases 1 to 2881)
+    AUTHORS   Imai,H., Chan,E.K., Kiyosawa,K., Fu,X.D. and Tan,E.M.
+    TITLE     Novel nuclear autoantigen with splicing factor motifs identified
+            with antibody from hepatocellular carcinoma
+    JOURNAL   J. Clin. Invest. 92 (5), 2419-2426 (1993)
+    PUBMED   8227358
+    COMMENT     REVIEWED REFSEQ: This record has been curated by NCBI staff. The
+            reference sequence was derived from DC346351.1, BC141835.1 and
+            C75555.1.
+            On Jun 16, 2011 this sequence version replaced gi:35493810.
+            
+            Summary: This gene encodes a member of the U2AF65 family of
+            proteins. The encoded protein is found in the nucleus, where it
+            co-localizes with core spliceosomal proteins. It has been shown to
+            play a role in both steroid hormone receptor-mediated transcription
+            and alternative splicing, and it is also a transcriptional
+            coregulator of the viral oncoprotein v-Rel. Multiple transcript
+            variants have been observed for this gene. A related pseudogene has
+            been identified on chromosome X. [provided by RefSeq, Aug 2011].
+            
+            Transcript Variant: This variant (1) encodes the longest isoform
+            (a, also called CC1.4).
+            
+            Publication Note:  This RefSeq record includes a subset of the
+            publications that are available for this gene. Please see the Gene
+            record to access additional publications.
+            
+            ##Evidence-Data-START##
+            Transcript exon combination :: BC141835.1, L10911.1 [ECO:0000332]
+            RNAseq introns              :: mixed/partial sample support
+                                           ERS025081, ERS025082 [ECO:0000350]
+            ##Evidence-Data-END##
+            COMPLETENESS: complete on the 3' end.
+    PRIMARY     REFSEQ_SPAN         PRIMARY_IDENTIFIER PRIMARY_SPAN        COMP
+            1-578               DC346351.1         3-580
+            579-2872            BC141835.1         429-2722
+            2873-2881           C75555.1           1-9                 c
+    FEATURES             Location/Qualifiers
+     source          1..2881
+                     /organism="Homo sapiens"
+                     /mol_type="mRNA"
+                     /db_xref="taxon:9606"
+                     /chromosome="20"
+                     /map="20q11.22"
+     gene            1..2881
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /note="RNA binding motif protein 39"
+                     /db_xref="GeneID:9584"
+                     /db_xref="HGNC:15923"
+                     /db_xref="HPRD:09201"
+                     /db_xref="MIM:604739"
+     exon            1..396
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     STS             35..262
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /standard_name="REN58946"
+                     /db_xref="UniSTS:383746"
+     misc_feature    221..223
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /note="upstream in-frame stop codon"
+     STS             299..453
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /standard_name="G64285"
+                     /db_xref="UniSTS:158667"
+     exon            397..460
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     CDS             410..2002
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /note="isoform a is encoded by transcript variant 1;
+                     coactivator of activating protein-1 and estrogen
+                     receptors; functional spliceosome-associated protein 59;
+                     RNA-binding region (RNP1, RRM) containing 2;
+                     hepatocellular carcinoma protein 1; splicing factor HCC1"
+                     /codon_start=1
+                     /product="RNA-binding protein 39 isoform a"
+                     /protein_id="NP_909122.1"
+                     /db_xref="GI:35493811"
+                     /db_xref="CCDS:CCDS13266.1"
+                     /db_xref="GeneID:9584"
+                     /db_xref="HGNC:15923"
+                     /db_xref="HPRD:09201"
+                     /db_xref="MIM:604739"
+                     /translation="MADDIDIEAMLEAPYKKDENKLSSANGHEERSKKRKKSKSRSRS
+                     HERKRSKSKERKRSRDRERKKSKSRERKRSRSKERRRSRSRSRDRRFRGRYRSPYSGP
+                     KFNSAIRGKIGLPHSIKLSRRRSRSKSPFRKDKSPVREPIDNLTPEERDARTVFCMQL
+                     AARIRPRDLEEFFSTVGKVRDVRMISDRNSRRSKGIAYVEFVDVSSVPLAIGLTGQRV
+                     LGVPIIVQASQAEKNRAAAMANNLQKGSAGPMRLYVGSLHFNITEDMLRGIFEPFGRI
+                     ESIQLMMDSETGRSKGYGFITFSDSECAKKALEQLNGFELAGRPMKVGHVTERTDASS
+                     ASSFLDSDELERTGIDLGTTGRLQLMARLAEGTGLQIPPAAQQALQMSGSLAFGAVAE
+                     FSFVIDLQTRLSQQTEASALAAAASVQPLATQCFQLSNMFNPQTEEEVGWDTEIKDDV
+                     IEECNKHGGVIHIYVDKNSAQGNVYVKCPSIAAAIAAVNALHGRWFAGKMITAAYVPL
+                     PTYHNLFPDSMTATQLLVPSRR"
+     misc_feature    413..415
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /experiment="experimental evidence, no additional details
+                     recorded"
+                     /note="N-acetylalanine; propagated from
+                     UniProtKB/Swiss-Prot (Q14498.2); acetylation site"
+     misc_feature    692..694
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /experiment="experimental evidence, no additional details
+                     recorded"
+                     /note="Phosphotyrosine; propagated from
+                     UniProtKB/Swiss-Prot (Q14498.2); phosphorylation site"
+     misc_feature    698..700
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /experiment="experimental evidence, no additional details
+                     recorded"
+                     /note="Phosphoserine; propagated from UniProtKB/Swiss-Prot
+                     (Q14498.2); phosphorylation site"
+     misc_feature    707..709
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /experiment="experimental evidence, no additional details
+                     recorded"
+                     /note="Phosphoserine; propagated from UniProtKB/Swiss-Prot
+                     (Q14498.2); phosphorylation site"
+     misc_feature    815..817
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /experiment="experimental evidence, no additional details
+                     recorded"
+                     /note="Phosphoserine; propagated from UniProtKB/Swiss-Prot
+                     (Q14498.2); phosphorylation site"
+     misc_feature    845..847
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /experiment="experimental evidence, no additional details
+                     recorded"
+                     /note="Phosphothreonine; propagated from
+                     UniProtKB/Swiss-Prot (Q14498.2); phosphorylation site"
+     misc_feature    1280..1627
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="non-experimental evidence, no additional
+                     details recorded"
+                     /note="propagated from UniProtKB/Swiss-Prot (Q14498.2);
+                     Region: Interaction with JUN (By similarity)"
+     misc_feature    1280..1474
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="non-experimental evidence, no additional
+                     details recorded"
+                     /note="propagated from UniProtKB/Swiss-Prot (Q14498.2);
+                     Region: Activating domain (By similarity)"
+     misc_feature    1409..1411
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /experiment="experimental evidence, no additional details
+                     recorded"
+                     /note="Phosphoserine; propagated from UniProtKB/Swiss-Prot
+                     (Q14498.2); phosphorylation site"
+     misc_feature    1418..1420
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /experiment="experimental evidence, no additional details
+                     recorded"
+                     /note="Phosphoserine; propagated from UniProtKB/Swiss-Prot
+                     (Q14498.2); phosphorylation site"
+     misc_feature    1430..1432
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /experiment="experimental evidence, no additional details
+                     recorded"
+                     /note="Phosphoserine; propagated from UniProtKB/Swiss-Prot
+                     (Q14498.2); phosphorylation site"
+     misc_feature    1472..1627
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="non-experimental evidence, no additional
+                     details recorded"
+                     /note="propagated from UniProtKB/Swiss-Prot (Q14498.2);
+                     Region: Interaction with ESR1 and ESR2 (By similarity)"
+     misc_feature    1625..1999
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="non-experimental evidence, no additional
+                     details recorded"
+                     /note="propagated from UniProtKB/Swiss-Prot (Q14498.2);
+                     Region: Interaction with NCOA6 (By similarity)"
+     exon            461..510
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            511..705
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            706..771
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            772..825
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            826..943
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            944..1096
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            1097..1234
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            1235..1300
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            1301..1505
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            1506..1583
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            1584..1634
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            1635..1716
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            1717..1822
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            1823..1901
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     exon            1902..2874
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /inference="alignment:Splign:1.39.8"
+     STS             1956..2182
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /standard_name="REN58786"
+                     /db_xref="UniSTS:383586"
+     STS             2104..2148
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /standard_name="D19S1033"
+                     /db_xref="UniSTS:154759"
+     STS             2145..2400
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /standard_name="REN58785"
+                     /db_xref="UniSTS:383585"
+     STS             2349..2590
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /standard_name="REN58784"
+                     /db_xref="UniSTS:383584"
+     STS             2450..2669
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /standard_name="RH69003"
+                     /db_xref="UniSTS:85360"
+     STS             2579..2828
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /standard_name="REN58783"
+                     /db_xref="UniSTS:383583"
+     STS             2639..2728
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+                     /standard_name="RH67917"
+                     /db_xref="UniSTS:84037"
+     polyA_signal    2851..2856
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+     polyA_site      2874
+                     /gene="RBM39"
+                     /gene_synonym="CAPER; CAPERalpha; FSAP59; HCC1; RNPC2"
+    ORIGIN      
+        1 atttggagct tggggcagct tctcgcgaga gcccgtgctg agggctctgt gaggccccgt
+       61 gtgtttgtgt gtgtgtatgt gtgctggtga atgtgagtac agggaagcag cggccgccat
+      121 ttcagggagc ttgtcgacgc tgtcgcaggg gtggatcctg agctgccgaa gccgccgtcc
+      181 tgctctcccg cgtgggcttc tctaattcca ttgttttttt tagattctct cgggcctagc
+      241 cgtccttgga acccgatatt cgggctgggc ggttccgcgg cctgggccta ggggcttaac
+      301 agtagcaaca gaagcggcgg cggcggcagc agcagcagca gcagcagcaa tctcttcccg
+      361 aacacgagca ccacaggcgc ccgaaggccg gaacaggcgt ttagagaaaa tggcagacga
+      421 tattgatatt gaagcaatgc ttgaggctcc ttacaagaag gatgagaaca agttgagcag
+      481 tgccaacggc catgaagaac gtagcaaaaa gaggaaaaaa agcaagagca gaagtcgtag
+      541 tcatgaacga aagagaagca aaagtaagga acggaagcga agtagagaca gagaaaggaa
+      601 aaagagcaaa agccgtgaaa gaaagcgaag tagaagcaaa gagaggcgac ggagccgctc
+      661 aagaagtcga gatcgaagat ttagaggccg ctacagaagt ccttactccg gaccaaaatt
+      721 taacagtgcc atccgaggaa agattgggtt gcctcatagc atcaaattaa gcagacgacg
+      781 ttcccgaagc aaaagtccat tcagaaaaga caagagccct gtgagagaac ctattgataa
+      841 tttaactcct gaggaaagag atgcaaggac agtcttctgt atgcagctgg cggcaagaat
+      901 tcgaccaagg gatttggaag agtttttctc tacagtagga aaggttcgag atgtgaggat
+      961 gatttctgac agaaattcaa gacgttccaa aggaattgct tatgtggagt tcgtcgatgt
+     1021 tagctcagtg cctctagcaa taggattaac tggccaacga gttttaggcg tgccaatcat
+     1081 agtacaggca tcacaggcag aaaaaaacag agctgcagca atggcaaaca atttacaaaa
+     1141 gggaagtgct ggacctatga ggctttatgt gggctcatta cacttcaaca taactgaaga
+     1201 tatgcttcgt gggatctttg agccttttgg aagaattgaa agtatccagc tgatgatgga
+     1261 cagtgaaact ggtcgatcca agggatatgg atttattaca ttttctgact cagaatgtgc
+     1321 caaaaaggct ttggaacaac ttaatggatt tgaactagca ggaagaccaa tgaaagttgg
+     1381 tcatgttact gaacgtactg atgcttcgag tgctagttca tttttggaca gtgatgaact
+     1441 ggaaaggact ggaattgatt tgggaacaac tggtcgtctt cagttaatgg caagacttgc
+     1501 agagggtaca ggtttgcaga ttccgccagc agcacagcaa gctctacaga tgagtggctc
+     1561 tttggcattt ggtgctgtgg cagaattctc ttttgttata gatttgcaaa caagactttc
+     1621 ccagcagact gaagcttcag ctttagctgc agctgcctct gttcagccac ttgcaacaca
+     1681 atgtttccaa ctctctaaca tgtttaaccc tcaaacagaa gaagaagttg gatgggatac
+     1741 cgagattaag gatgatgtga ttgaagaatg taataaacat ggaggagtta ttcatattta
+     1801 tgttgacaaa aattcagctc agggcaatgt gtatgtgaag tgcccatcaa ttgctgcagc
+     1861 tattgctgct gtcaatgcat tgcatggcag gtggtttgct ggtaaaatga taacagcagc
+     1921 atatgtacct cttccaactt accacaacct gtttcctgat tctatgacag caacacagct
+     1981 actggttcca agtagacgat gaaggaagat atagtccctt atgtatatag ctttttttct
+     2041 ttcttgagaa ttcatcttga gttatctttt atttagataa aaataaagag gcaaggatct
+     2101 actgtcattt gtatgcaatt tcctgttacc ttgaaaaaat aaaaatgtta acaggaatgc
+     2161 agtgtgctca ttctccctaa atagtaaatc ccactgtata caaaactgtt ctcttgttct
+     2221 gccttttaaa atgttcatgt agaaaattaa tgaactatag gaatagctct aggagaacaa
+     2281 atgtgctttc tgtaaaaagg cagaccaggg atgtaatgtt tttaatgttt cagaagccta
+     2341 actttttaca cagtggttac atttcacatt tcactaatgt tgatatttgg ctgatggttg
+     2401 agcagtttct gaaatacaca tttagtgtat ggaaatacaa gacagctaaa gggctgtttg
+     2461 gttagcatct catcttgcat tctgatcaat tggcaagaaa gggagatttc aaaattatat
+     2521 ttcttgatgg tatcttttca attaatgtat ctgtaaaagt ttctttgtaa atactatgtg
+     2581 ttctggtgtg tcttaaaatt ccaaacaaaa tgatccctgc atttcctgaa gatgtttaaa
+     2641 cgtgagagtc tggtaggcaa agcagtctga gaaagaaata ggaaatgcag aaataggttt
+     2701 tgtctggttg catataatct ttgctctttt taagctctgt gagctctgaa atatattttt
+     2761 gggttacttc agtgtgtttg acaagacagc ttgatatttc tatcaaacaa atgactttca
+     2821 tattgcaaca atctttgtaa gaaccactca aataaaagtc tcttaaaaag gccaaaaaaa
+     2881 a
+    
+    
+    */
     private EbiDbEntry() {
         // Private: instances are created inside this class by its static createInstance... methods.
     }
 
@@ -51,30 +500,53 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
 
     /**
      * Builds an entry from the lines of a plain-text record that uses the
      * ACCESSION / DEFINITION / SOURCE keywords.
      * <ul>
      * <li>ACCESSION: the text after the keyword is stored via setPA.</li>
      * <li>DEFINITION: the text after the keyword is collected, cut at the first
      * "[" if the line has one, otherwise at the first "."; lines starting with
      * a space that follow while still inside the definition are appended,
      * separated by a single space and cut by the same rule. The trimmed result
      * is stored via setDe if it is not empty.</li>
      * <li>SOURCE: the text after the keyword, cut at the first "(" if present,
      * is stored via setOs.</li>
      * </ul>
      * Any other line ends the definition.
      * <p>
      * NOTE(review): in_def stays true after a definition line has been cut at
      * "[" or ".", so following indented lines are still appended to the
      * definition; confirm that this is intended.
      *
      * @param lines the lines of the record, in order
      * @return the populated entry
      */
     public static SequenceDatabaseEntry createInstanceFromPlainTextForRefSeq( final List<String> lines ) {
         final EbiDbEntry e = new EbiDbEntry();
+        final StringBuilder def = new StringBuilder();
+        boolean in_def = false;
         // def accumulates the DEFINITION text across lines; in_def is true while inside that field.
         for( final String line : lines ) {
             //  System.out.println( "-" + line );
             if ( line.startsWith( "ACCESSION" ) ) {
-                e.setPA( DatabaseTools.extract( line, "ACCESSION" ) );
+                e.setPA( SequenceDbWsTools.extractFrom( line, "ACCESSION" ) );
+                in_def = false;
             }
             else if ( line.startsWith( "DEFINITION" ) ) {
                 if ( line.indexOf( "[" ) > 0 ) {
-                    e.setDe( DatabaseTools.extract( line, "DEFINITION", "[" ) );
+                    def.append( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "[" ) );
                 }
                 else if ( line.indexOf( "." ) > 0 ) {
-                    e.setDe( DatabaseTools.extract( line, "DEFINITION", "." ) );
+                    def.append( SequenceDbWsTools.extractFromTo( line, "DEFINITION", "." ) );
                 }
                 else {
-                    e.setDe( DatabaseTools.extract( line, "DEFINITION" ) );
+                    def.append( SequenceDbWsTools.extractFrom( line, "DEFINITION" ) );
                 }
+                in_def = true;
             }
             else if ( line.startsWith( "SOURCE" ) ) {
                 if ( line.indexOf( "(" ) > 0 ) {
-                    e.setOs( DatabaseTools.extract( line, "SOURCE", "(" ) );
+                    e.setOs( SequenceDbWsTools.extractFromTo( line, "SOURCE", "(" ) );
                 }
                 else {
-                    e.setOs( DatabaseTools.extract( line, "SOURCE" ) );
+                    e.setOs( SequenceDbWsTools.extractFrom( line, "SOURCE" ) );
                 }
+                in_def = false;
             }
+            else if ( line.startsWith( " " ) && in_def ) {
+                def.append( "" );
+                if ( line.indexOf( "[" ) > 0 ) {
+                    def.append( SequenceDbWsTools.extractTo( line, "[" ) );
+                }
+                else if ( line.indexOf( "." ) > 0 ) {
+                    def.append( SequenceDbWsTools.extractTo( line, "." ) );
+                }
+                else {
+                    def.append( line.trim() );
+                }
+            }
+            else {
+                in_def = false;
+            }
+        }
+        if ( def.length() > 0 ) {
+            e.setDe( def.toString().trim() );
         }
         return e;
     }
@@ -83,22 +555,22 @@ public final class EbiDbEntry implements SequenceDatabaseEntry {
         final EbiDbEntry e = new EbiDbEntry();
         for( final String line : lines ) {
             if ( line.startsWith( "PA" ) ) {
-                e.setPA( DatabaseTools.extract( line, "PA" ) );
+                e.setPA( SequenceDbWsTools.extractFrom( line, "PA" ) );
             }
             else if ( line.startsWith( "DE" ) ) {
-                e.setDe( DatabaseTools.extract( line, "DE" ) );
+                e.setDe( SequenceDbWsTools.extractFrom( line, "DE" ) );
             }
             else if ( line.startsWith( "OS" ) ) {
                 if ( line.indexOf( "(" ) > 0 ) {
-                    e.setOs( DatabaseTools.extract( line, "OS", "(" ) );
+                    e.setOs( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
                 }
                 else {
-                    e.setOs( DatabaseTools.extract( line, "OS" ) );
+                    e.setOs( SequenceDbWsTools.extractFrom( line, "OS" ) );
                 }
             }
             else if ( line.startsWith( "OX" ) ) {
                 if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
-                    e.setTaxId( DatabaseTools.extract( line, "NCBI_TaxID=", ";" ) );
+                    e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
                 }
             }
         }
index 55eead3..cecef9f 100644 (file)
@@ -54,13 +54,34 @@ import org.forester.util.SequenceAccessionTools;
 
 public final class SequenceDbWsTools {
 
-    public final static String   EMBL_REFSEQ       = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=REFSEQ&style=raw&id=";
-    public final static String   BASE_UNIPROT_URL  = "http://www.uniprot.org/";
-    public final static String   EMBL_DBS_EMBL     = "embl";
-    public final static String   EMBL_DBS_REFSEQ_N = "refseqn";
-    public final static String   EMBL_DBS_REFSEQ_P = "refseqp";
-    private final static boolean DEBUG             = true;
-    private final static String  URL_ENC           = "UTF-8";
+    public final static String   EMBL_REFSEQ             = "http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=REFSEQ&style=raw&id="; // ends in "id="; presumably an identifier is appended by the caller (confirm)
+    public final static String   BASE_UNIPROT_URL        = "http://www.uniprot.org/";
+    public final static String   EMBL_DBS_EMBL           = "embl";
+    public final static String   EMBL_DBS_REFSEQ_N       = "refseqn";
+    public final static String   EMBL_DBS_REFSEQ_P       = "refseqp";
+    private final static boolean DEBUG                   = true; // NOTE(review): debug flag is switched on here; confirm this is intended
+    private final static String  URL_ENC                 = "UTF-8"; // charset name, presumably used when URL-encoding queries (confirm)
+    public final static int      DEFAULT_LINES_TO_RETURN = 4000; // value forwarded by the obtainSeqInformation convenience overloads
+
+    /**
+     * Returns the part of {@code target} that follows the first occurrence of
+     * {@code a}, with leading and trailing whitespace removed.
+     *
+     * @param target the text to extract from
+     * @param a the marker after which extraction starts
+     * @return the trimmed text behind the first occurrence of {@code a}
+     * @throws IllegalArgumentException if {@code a} does not occur in {@code target}
+     */
+    final static String extractFrom( final String target, final String a ) {
+        final int i_a = target.indexOf( a );
+        if ( i_a < 0 ) {
+            // Without this check a missing marker silently produced an unrelated
+            // substring starting at a.length() - 1.
+            throw new IllegalArgumentException( "attempt to extract from \"" + target + "\" after \"" + a + "\"" );
+        }
+        return target.substring( i_a + a.length() ).trim();
+    }
+
+    /**
+     * Returns the part of {@code target} between the first occurrence of
+     * {@code a} and the first occurrence of {@code b} that lies behind it, with
+     * leading and trailing whitespace removed.
+     *
+     * @param target the text to extract from
+     * @param a the marker after which extraction starts
+     * @param b the marker at which extraction stops
+     * @return the trimmed text between {@code a} and the following {@code b}
+     * @throws IllegalArgumentException if {@code a} does not occur in
+     *         {@code target}, or {@code b} does not occur behind it
+     */
+    final static String extractFromTo( final String target, final String a, final String b ) {
+        final int i_a = target.indexOf( a );
+        final int start = i_a + a.length();
+        // Search for the end marker only behind the start marker: an earlier
+        // occurrence of b (or one overlapping a) must not be used as the end.
+        final int i_b = ( i_a < 0 ) ? -1 : target.indexOf( b, start );
+        if ( i_b < 0 ) {
+            throw new IllegalArgumentException( "attempt to extract from \"" + target + "\" between \"" + a
+                    + "\" and \"" + b + "\"" );
+        }
+        return target.substring( start, i_b ).trim();
+    }
+
+    /**
+     * Returns the part of {@code target} that precedes the first occurrence of
+     * {@code b}, with leading and trailing whitespace removed.
+     *
+     * @param target the text to extract from
+     * @param b the marker at which extraction stops
+     * @return the trimmed text in front of the first occurrence of {@code b}
+     * @throws IllegalArgumentException if {@code b} does not occur in {@code target}
+     */
+    final static String extractTo( final String target, final String b ) {
+        final int i_b = target.indexOf( b );
+        if ( i_b < 0 ) {
+            // A missing marker used to surface as a StringIndexOutOfBoundsException
+            // from substring( 0, -1 ); report it like the sibling helpers do.
+            throw new IllegalArgumentException( "attempt to extract from \"" + target + "\" up to \"" + b + "\"" );
+        }
+        return target.substring( 0, i_b ).trim();
+    }
 
     public static List<UniProtTaxonomy> getTaxonomiesFromCommonNameStrict( final String cn,
                                                                            final int max_taxonomies_return )
@@ -155,6 +176,16 @@ public final class SequenceDbWsTools {
         }
     }
 
+    /**
+     * Convenience overload: forwards to the four-argument
+     * {@code obtainSeqInformation}, supplying {@link #DEFAULT_LINES_TO_RETURN}
+     * as its second argument.
+     *
+     * @param allow_to_set_taxonomic_data passed through unchanged
+     * @param not_found passed through unchanged; presumably receives the
+     *        identifiers that could not be resolved (confirm against the
+     *        four-argument overload)
+     * @param node passed through unchanged
+     * @throws IOException if the four-argument overload throws it
+     */
+    public final static void obtainSeqInformation( final boolean allow_to_set_taxonomic_data,
+                                                   final SortedSet<String> not_found,
+                                                   final PhylogenyNode node ) throws IOException {
+        final int lines_to_return = DEFAULT_LINES_TO_RETURN;
+        obtainSeqInformation( allow_to_set_taxonomic_data, lines_to_return, not_found, node );
+    }
+
+    /**
+     * Convenience overload for a single node: forwards to the four-argument
+     * {@code obtainSeqInformation} with {@code true} as first argument,
+     * {@link #DEFAULT_LINES_TO_RETURN} as second, and a fresh, empty set that
+     * is not handed back to the caller.
+     *
+     * @param node passed through unchanged
+     * @throws IOException if the four-argument overload throws it
+     */
+    public final static void obtainSeqInformation( final PhylogenyNode node ) throws IOException {
+        // Whatever the callee stores in this set is dropped when the method returns.
+        final TreeSet<String> discarded = new TreeSet<String>();
+        obtainSeqInformation( true, DEFAULT_LINES_TO_RETURN, discarded, node );
+    }
+
     public final static SortedSet<String> obtainSeqInformation( final Phylogeny phy,
                                                                 final boolean ext_nodes_only,
                                                                 final boolean allow_to_set_taxonomic_data,
index 4a8d158..4ba10de 100644 (file)
@@ -174,24 +174,24 @@ public final class UniProtEntry implements SequenceDatabaseEntry {
         for( final String line : lines ) {
             //System.out.println( line );
             if ( line.startsWith( "AC" ) ) {
-                e.setAc( DatabaseTools.extract( line, "AC", ";" ) );
+                e.setAc( SequenceDbWsTools.extractFromTo( line, "AC", ";" ) );
             }
             else if ( line.startsWith( "DE" ) && ForesterUtil.isEmpty( e.getSequenceName() ) ) {
                 if ( ( line.indexOf( "RecName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) {
-                    e.setSequenceName( DatabaseTools.extract( line, "Full=", ";" ) );
+                    e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", ";" ) );
                 }
                 else if ( ( line.indexOf( "SubName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) {
-                    e.setSequenceName( DatabaseTools.extract( line, "Full=", ";" ) );
+                    e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", ";" ) );
                 }
             }
             else if ( line.startsWith( "DE" ) && ForesterUtil.isEmpty( e.getSequenceSymbol() ) ) {
                 if ( line.indexOf( "Short=" ) > 0 ) {
-                    e.setSequenceSymbol( DatabaseTools.extract( line, "Short=", ";" ) );
+                    e.setSequenceSymbol( SequenceDbWsTools.extractFromTo( line, "Short=", ";" ) );
                 }
             }
             else if ( line.startsWith( "GN" ) && ForesterUtil.isEmpty( e.getGeneName() ) ) {
                 if ( line.indexOf( "Name=" ) > 0 ) {
-                    e.setGeneName( DatabaseTools.extract( line, "Name=", ";" ) );
+                    e.setGeneName( SequenceDbWsTools.extractFromTo( line, "Name=", ";" ) );
                 }
             }
             else if ( line.startsWith( "DR" ) ) {
@@ -274,15 +274,15 @@ public final class UniProtEntry implements SequenceDatabaseEntry {
             }
             else if ( line.startsWith( "OS" ) ) {
                 if ( line.indexOf( "(" ) > 0 ) {
-                    e.setOsScientificName( DatabaseTools.extract( line, "OS", "(" ) );
+                    e.setOsScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) );
                 }
                 else {
-                    e.setOsScientificName( DatabaseTools.extract( line, "OS", "." ) );
+                    e.setOsScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "." ) );
                 }
             }
             else if ( line.startsWith( "OX" ) ) {
                 if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
-                    e.setTaxId( DatabaseTools.extract( line, "NCBI_TaxID=", ";" ) );
+                    e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
                 }
             }
         }