From: cmzmasek@gmail.com Date: Thu, 21 Apr 2011 20:24:03 +0000 (+0000) Subject: in progress X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=389376dc037d39d3c7983e2866cbfb47bebf416a;p=jalview.git in progress --- diff --git a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java index eea852c..9d3fab4 100644 --- a/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java +++ b/forester/java/src/org/forester/archaeopteryx/MainFrameApplication.java @@ -67,8 +67,8 @@ import org.forester.archaeopteryx.tools.GoAnnotation; import org.forester.archaeopteryx.tools.PhyloInferenceDialog; import org.forester.archaeopteryx.tools.PhylogeneticInferenceOptions; import org.forester.archaeopteryx.tools.PhylogeneticInferrer; +import org.forester.archaeopteryx.tools.SequenceDataRetriver; import org.forester.archaeopteryx.tools.TaxonomyDataObtainer; -import org.forester.archaeopteryx.tools.UniProtSequenceObtainer; import org.forester.archaeopteryx.webservices.PhylogeniesWebserviceClient; import org.forester.archaeopteryx.webservices.WebservicesManager; import org.forester.io.parsers.FastaParser; @@ -1401,9 +1401,9 @@ public final class MainFrameApplication extends MainFrame { if ( getCurrentTreePanel() != null ) { final Phylogeny phy = getCurrentTreePanel().getPhylogeny(); if ( ( phy != null ) && !phy.isEmpty() ) { - final UniProtSequenceObtainer u = new UniProtSequenceObtainer( this, - _mainpanel.getCurrentTreePanel(), - phy.copy() ); + final SequenceDataRetriver u = new SequenceDataRetriver( this, + _mainpanel.getCurrentTreePanel(), + phy.copy() ); new Thread( u ).start(); } } diff --git a/forester/java/src/org/forester/archaeopteryx/tools/UniProtSequenceObtainer.java b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java similarity index 67% rename from forester/java/src/org/forester/archaeopteryx/tools/UniProtSequenceObtainer.java rename to forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java index 8bea209..7f0477e 100644 --- a/forester/java/src/org/forester/archaeopteryx/tools/UniProtSequenceObtainer.java +++ b/forester/java/src/org/forester/archaeopteryx/tools/SequenceDataRetriver.java @@ -25,11 +25,12 @@ package org.forester.archaeopteryx.tools; +import java.io.FileNotFoundException; import java.io.IOException; import java.net.UnknownHostException; -import java.util.List; import java.util.SortedSet; import java.util.TreeSet; +import java.util.regex.Pattern; import javax.swing.JOptionPane; @@ -43,16 +44,24 @@ import org.forester.phylogeny.data.Sequence; import org.forester.phylogeny.data.Taxonomy; import org.forester.phylogeny.iterators.PhylogenyNodeIterator; import org.forester.util.ForesterUtil; -import org.forester.ws.uniprot.UniProtEntry; +import org.forester.ws.uniprot.SequenceDatabaseEntry; import org.forester.ws.uniprot.UniProtWsTools; -public class UniProtSequenceObtainer implements Runnable { +public final class SequenceDataRetriver implements Runnable { + // uniprot/expasy accession number format (6 chars): + // letter digit letter-or-digit letter-or-digit letter-or-digit digit + private final static Pattern UNIPROT_AC_PATTERN = Pattern.compile( "[A-NR-ZOPQ]\\d[A-Z0-9]{3}\\d" ); private final Phylogeny _phy; private final MainFrameApplication _mf; private final TreePanel _treepanel; + private final static boolean DEBUG = true; - public UniProtSequenceObtainer( final MainFrameApplication mf, final TreePanel treepanel, final Phylogeny phy ) { + private enum Db { + UNKNOWN, UNIPROT; + } + + public SequenceDataRetriver( final MainFrameApplication mf, final TreePanel treepanel, final Phylogeny phy ) { _phy = phy; _mf = mf; _treepanel = treepanel; @@ -145,14 +154,33 @@ public class UniProtSequenceObtainer implements Runnable { final SortedSet not_found = new TreeSet(); for( final PhylogenyNodeIterator iter = phy.iteratorPostorder(); iter.hasNext(); ) { final PhylogenyNode node = iter.next(); + Sequence seq = null; + Taxonomy tax = null; if ( node.getNodeData().isHasSequence() ) { - //TODO Do something + seq = node.getNodeData().getSequence(); + } + else { + seq = new Sequence(); + } + if ( node.getNodeData().isHasTaxonomy() ) { + tax = node.getNodeData().getTaxonomy(); + } + else { + tax = new Taxonomy(); + } + String query = null; + Db db = Db.UNKNOWN; + if ( node.getNodeData().isHasSequence() && ( node.getNodeData().getSequence().getAccession() != null ) + && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getSource() ) + && !ForesterUtil.isEmpty( node.getNodeData().getSequence().getAccession().getValue() ) + && node.getNodeData().getSequence().getAccession().getValue().toLowerCase().startsWith( "uniprot" ) ) { + query = node.getNodeData().getSequence().getAccession().getValue(); + db = Db.UNIPROT; } - // else if ( !ForesterUtil.isEmpty( node.getName() ) ) { - // not_found.add( node.getName() ); - // } else if ( !ForesterUtil.isEmpty( node.getName() ) ) { - String query = node.getName(); + query = node.getName(); + } + if ( !ForesterUtil.isEmpty( query ) ) { if ( query.indexOf( '/' ) > 0 ) { query = query.substring( 0, query.indexOf( '/' ) ); } @@ -162,24 +190,33 @@ public class UniProtSequenceObtainer implements Runnable { if ( query.indexOf( '_' ) > 0 ) { query = query.substring( 0, query.indexOf( '_' ) ); } - final UniProtEntry upe = obtainUniProtEntry( query ); - if ( upe != null ) { - final Sequence seq = new Sequence(); - final Taxonomy tax = new Taxonomy(); - if ( !ForesterUtil.isEmpty( upe.getAc() ) ) { - seq.setAccession( new Accession( upe.getAc(), "uniprot" ) ); + SequenceDatabaseEntry db_entry = null; + if ( ( db == Db.UNIPROT ) || UNIPROT_AC_PATTERN.matcher( query ).matches() ) { + if ( DEBUG ) { + System.out.println( "uniprot: " + query ); + } + try { + db_entry = UniProtWsTools.obtainUniProtEntry( query, 200 ); } - if ( !ForesterUtil.isEmpty( upe.getRecName() ) ) { - seq.setName( upe.getRecName() ); + catch ( final FileNotFoundException e ) { + // Ignore. + } + } + if ( db_entry != null ) { + if ( !ForesterUtil.isEmpty( db_entry.getAccession() ) ) { + seq.setAccession( new Accession( db_entry.getAccession(), "uniprot" ) ); } - if ( !ForesterUtil.isEmpty( upe.getSymbol() ) ) { - seq.setSymbol( upe.getSymbol() ); + if ( !ForesterUtil.isEmpty( db_entry.getSequenceName() ) ) { + seq.setName( db_entry.getSequenceName() ); } - if ( !ForesterUtil.isEmpty( upe.getOsScientificName() ) ) { - tax.setScientificName( upe.getOsScientificName() ); + if ( !ForesterUtil.isEmpty( db_entry.getSequenceSymbol() ) ) { + seq.setSymbol( db_entry.getSequenceSymbol() ); } - if ( !ForesterUtil.isEmpty( upe.getTaxId() ) ) { - tax.setIdentifier( new Identifier( upe.getTaxId(), "uniprot" ) ); + if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyScientificName() ) ) { + tax.setScientificName( db_entry.getTaxonomyScientificName() ); + } + if ( !ForesterUtil.isEmpty( db_entry.getTaxonomyIdentifier() ) ) { + tax.setIdentifier( new Identifier( db_entry.getTaxonomyIdentifier(), "uniprot" ) ); } node.getNodeData().setTaxonomy( tax ); node.getNodeData().setSequence( seq ); @@ -187,17 +224,11 @@ public class UniProtSequenceObtainer implements Runnable { else { not_found.add( node.getName() ); } - //} } } return not_found; } - static UniProtEntry obtainUniProtEntry( final String query ) throws IOException { - final List lines = UniProtWsTools.queryUniprot( "uniprot/" + query + ".txt", 200 ); - return UniProtEntry.createInstanceFromPlainText( lines ); - } - @Override public void run() { execute(); diff --git a/forester/java/src/org/forester/test/Test.java b/forester/java/src/org/forester/test/Test.java index 8a09ac8..a17dbe7 100644 --- a/forester/java/src/org/forester/test/Test.java +++ b/forester/java/src/org/forester/test/Test.java @@ -99,6 +99,7 @@ import org.forester.util.DescriptiveStatistics; import org.forester.util.ForesterConstants; import org.forester.util.ForesterUtil; import org.forester.util.GeneralTable; +import org.forester.ws.uniprot.SequenceDatabaseEntry; import org.forester.ws.uniprot.UniProtTaxonomy; import org.forester.ws.uniprot.UniProtWsTools; import org.forester.ws.wabi.TxSearch; @@ -645,14 +646,23 @@ public final class Test { System.out.println( "failed." ); failed++; } + System.out.print( "Uniprot Entry Retrieval: " ); + if ( Test.testUniprotEntryRetrieval() ) { + System.out.println( "OK." ); + succeeded++; + } + else { + System.out.println( "failed." ); + failed++; + } System.out.print( "Uniprot Taxonomy Search: " ); if ( Test.testUniprotTaxonomySearch() ) { System.out.println( "OK." ); succeeded++; } else { - System.out - .println( "failed [will not count towards failed tests since it might be due to absence internet connection]" ); + System.out.println( "failed." ); + failed++; } if ( Mafft.isInstalled() ) { System.out.print( "MAFFT (external program): " ); @@ -7709,10 +7719,44 @@ public final class Test { return false; } } + catch ( final IOException e ) { + System.out.println(); + System.out.println( "the following might be due to absence internet connection:" ); + e.printStackTrace( System.out ); + return true; + } catch ( final Exception e ) { + return false; + } + return true; + } + + private static boolean testUniprotEntryRetrieval() { + try { + final SequenceDatabaseEntry entry = UniProtWsTools.obtainUniProtEntry( "P12345", 200 ); + if ( !entry.getAccession().equals( "P12345" ) ) { + return false; + } + if ( !entry.getTaxonomyScientificName().equals( "Oryctolagus cuniculus" ) ) { + return false; + } + if ( !entry.getSequenceName().equals( "Aspartate aminotransferase, mitochondrial" ) ) { + return false; + } + if ( !entry.getSequenceSymbol().equals( "GOT2" ) ) { + return false; + } + if ( !entry.getTaxonomyIdentifier().equals( "9986" ) ) { + return false; + } + } + catch ( final IOException e ) { System.out.println(); System.out.println( "the following might be due to absence internet connection:" ); e.printStackTrace( System.out ); + return true; + } + catch ( final Exception e ) { return false; } return true; diff --git a/forester/java/src/org/forester/ws/uniprot/SequenceDatabaseEntry.java b/forester/java/src/org/forester/ws/uniprot/SequenceDatabaseEntry.java new file mode 100644 index 0000000..e62ee5f --- /dev/null +++ b/forester/java/src/org/forester/ws/uniprot/SequenceDatabaseEntry.java @@ -0,0 +1,39 @@ +// $Id: +// forester -- software libraries and applications +// for genomics and evolutionary biology research. +// +// Copyright (C) 2010 Christian M Zmasek +// Copyright (C) 2010 Sanford-Burnham Medical Research Institute +// All rights reserved +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA +// +// Contact: phylosoft @ gmail . com +// WWW: www.phylosoft.org/forester + +package org.forester.ws.uniprot; + +public interface SequenceDatabaseEntry { + + public String getAccession(); + + public String getSequenceName(); + + public String getTaxonomyScientificName(); + + public String getTaxonomyIdentifier(); + + public String getSequenceSymbol(); +} \ No newline at end of file diff --git a/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java b/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java index 906473c..1ee4cf5 100644 --- a/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java +++ b/forester/java/src/org/forester/ws/uniprot/UniProtEntry.java @@ -27,7 +27,7 @@ package org.forester.ws.uniprot; import java.util.List; -public final class UniProtEntry { +public final class UniProtEntry implements SequenceDatabaseEntry { private String _ac; private String _rec_name; @@ -38,7 +38,7 @@ public final class UniProtEntry { private UniProtEntry() { } - public static UniProtEntry createInstanceFromPlainText( final List lines ) { + public static SequenceDatabaseEntry createInstanceFromPlainText( final List lines ) { final UniProtEntry e = new UniProtEntry(); for( final String line : lines ) { if ( line.startsWith( "AC" ) ) { @@ -81,7 +81,11 @@ public final class UniProtEntry { return target.substring( i_a + a.length(), i_b ).trim(); } - public String getAc() { + /* (non-Javadoc) + * @see org.forester.ws.uniprot.SequenceDatabaseEntry#getAc() + */ + @Override + public String getAccession() { return _ac; } @@ -91,7 +95,11 @@ public final class UniProtEntry { } } - public String getRecName() { + /* (non-Javadoc) + * @see org.forester.ws.uniprot.SequenceDatabaseEntry#getRecName() + */ + @Override + public String getSequenceName() { return _rec_name; } @@ -101,7 +109,11 @@ public final class UniProtEntry { } } - public String getOsScientificName() { + /* (non-Javadoc) + * @see org.forester.ws.uniprot.SequenceDatabaseEntry#getOsScientificName() + */ + @Override + public String getTaxonomyScientificName() { return _os_scientific_name; } @@ -111,7 +123,11 @@ public final class UniProtEntry { } } - public String getTaxId() { + /* (non-Javadoc) + * @see org.forester.ws.uniprot.SequenceDatabaseEntry#getTaxId() + */ + @Override + public String getTaxonomyIdentifier() { return _tax_id; } @@ -121,7 +137,11 @@ public final class UniProtEntry { } } - public String getSymbol() { + /* (non-Javadoc) + * @see org.forester.ws.uniprot.SequenceDatabaseEntry#getSymbol() + */ + @Override + public String getSequenceSymbol() { return _symbol; } diff --git a/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java b/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java index 1ecf5ea..8afb9eb 100644 --- a/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java +++ b/forester/java/src/org/forester/ws/uniprot/UniProtWsTools.java @@ -213,4 +213,10 @@ public final class UniProtWsTools { in.close(); return result; } + + public static SequenceDatabaseEntry obtainUniProtEntry( final String query, final int max_lines_to_return ) + throws IOException { + final List lines = queryUniprot( "uniprot/" + query + ".txt", max_lines_to_return ); + return UniProtEntry.createInstanceFromPlainText( lines ); + } }