X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fws%2Fseqdb%2FUniProtEntry.java;h=a3ea1e383c86bc63957998785640a78e510e0540;hb=10297bd8b8a4b4ab198a17a42fc6ff24ae2ed49b;hp=2eaa7204aac4c2e89f278eee4fb2c1a6f08b9510;hpb=7e2a839d55608212fed645ce9ffe3a3f4952fb17;p=jalview.git diff --git a/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java b/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java index 2eaa720..a3ea1e3 100644 --- a/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java +++ b/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java @@ -25,8 +25,9 @@ package org.forester.ws.seqdb; -import java.util.ArrayList; import java.util.List; +import java.util.SortedSet; +import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -34,6 +35,9 @@ import org.forester.go.BasicGoTerm; import org.forester.go.GoNameSpace; import org.forester.go.GoTerm; import org.forester.phylogeny.data.Accession; +import org.forester.phylogeny.data.Annotation; +import org.forester.sequence.BasicSequence; +import org.forester.sequence.MolecularSequence; import org.forester.util.ForesterUtil; public final class UniProtEntry implements SequenceDatabaseEntry { @@ -49,14 +53,16 @@ public final class UniProtEntry implements SequenceDatabaseEntry { public final static Pattern PDB_PATTERN = Pattern.compile( "PDB;\\s+([0-9A-Z]{4});\\s+([^;]+)" ); public final static Pattern PharmGKB_PATTERN = Pattern.compile( "PharmGKB;\\s+([0-9A-Z]+);" ); public final static Pattern Reactome_PATTERN = Pattern.compile( "Reactome;\\s+([0-9A-Z]+);\\s+([^\\.]+)" ); + public final static Pattern HGNC_PATTERN = Pattern.compile( "HGNC;\\s+HGNC:(\\d+);" ); private String _ac; - private ArrayList _cross_references; + private SortedSet _cross_references; private String _gene_name; - private List _go_terms; + private SortedSet _go_terms; private String _name; private String _os_scientific_name; private String _symbol; private String _tax_id; + private MolecularSequence _mol_seq; private UniProtEntry() { } @@ -72,7 +78,7 @@ public final class UniProtEntry implements SequenceDatabaseEntry { } @Override - public List getCrossReferences() { + public SortedSet getCrossReferences() { return _cross_references; } @@ -82,7 +88,7 @@ public final class UniProtEntry implements SequenceDatabaseEntry { } @Override - public List getGoTerms() { + public SortedSet getGoTerms() { return _go_terms; } @@ -117,22 +123,20 @@ public final class UniProtEntry implements SequenceDatabaseEntry { && ForesterUtil.isEmpty( getTaxonomyScientificName() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) && ForesterUtil.isEmpty( getGeneName() ) && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) && ( ( getGoTerms() == null ) || getGoTerms().isEmpty() ) && ( ( getCrossReferences() == null ) || getCrossReferences() - .isEmpty() ) ); + .isEmpty() ) ); } private void addCrossReference( final Accession accession ) { if ( _cross_references == null ) { - _cross_references = new ArrayList(); + _cross_references = new TreeSet(); } - System.out.println( "XREF ADDED: " + accession ); _cross_references.add( accession ); } private void addGoTerm( final BasicGoTerm g ) { if ( _go_terms == null ) { - _go_terms = new ArrayList(); + _go_terms = new TreeSet(); } - System.out.println( "GOTERM ADDED: " + g ); _go_terms.add( g ); } @@ -142,6 +146,10 @@ public final class UniProtEntry implements SequenceDatabaseEntry { } } + private void setMolecularSequence( final MolecularSequence mol_seq ) { + _mol_seq = mol_seq; + } + private void setGeneName( final String gene_name ) { if ( _gene_name == null ) { _gene_name = gene_name; @@ -172,27 +180,30 @@ public final class UniProtEntry implements SequenceDatabaseEntry { public static SequenceDatabaseEntry createInstanceFromPlainText( final List lines ) { final UniProtEntry e = new UniProtEntry(); + boolean saw_sq = false; + final StringBuffer sq_buffer = new StringBuffer(); + boolean is_aa = false; for( final String line : lines ) { //System.out.println( line ); if ( line.startsWith( "AC" ) ) { - e.setAc( DatabaseTools.extract( line, "AC", ";" ) ); + e.setAc( SequenceDbWsTools.extractFromTo( line, "AC", ";" ) ); } else if ( line.startsWith( "DE" ) && ForesterUtil.isEmpty( e.getSequenceName() ) ) { if ( ( line.indexOf( "RecName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) { - e.setSequenceName( DatabaseTools.extract( line, "Full=", ";" ) ); + e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", ";" ) ); } else if ( ( line.indexOf( "SubName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) { - e.setSequenceName( DatabaseTools.extract( line, "Full=", ";" ) ); + e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", ";" ) ); } } else if ( line.startsWith( "DE" ) && ForesterUtil.isEmpty( e.getSequenceSymbol() ) ) { if ( line.indexOf( "Short=" ) > 0 ) { - e.setSequenceSymbol( DatabaseTools.extract( line, "Short=", ";" ) ); + e.setSequenceSymbol( SequenceDbWsTools.extractFromTo( line, "Short=", ";" ) ); } } else if ( line.startsWith( "GN" ) && ForesterUtil.isEmpty( e.getGeneName() ) ) { if ( line.indexOf( "Name=" ) > 0 ) { - e.setGeneName( DatabaseTools.extract( line, "Name=", ";" ) ); + e.setGeneName( SequenceDbWsTools.extractFromTo( line, "Name=", ";" ) ); } } else if ( line.startsWith( "DR" ) ) { @@ -209,7 +220,6 @@ public final class UniProtEntry implements SequenceDatabaseEntry { else if ( ns_str.equals( "C" ) ) { gns = GoNameSpace.CELLULAR_COMPONENT_STR; } - System.out.println( "GO:" + id + " " + desc + " " + ns_str ); e.addGoTerm( new BasicGoTerm( id, desc, gns, false ) ); } } @@ -273,21 +283,59 @@ public final class UniProtEntry implements SequenceDatabaseEntry { e.addCrossReference( new Accession( m.group( 1 ), "Reactome", m.group( 2 ) ) ); } } + else if ( line.indexOf( "HGNC;" ) > 0 ) { + final Matcher m = HGNC_PATTERN.matcher( line ); + if ( m.find() ) { + e.addCrossReference( new Accession( m.group( 1 ), "HGNC" ) ); + } + } } else if ( line.startsWith( "OS" ) ) { if ( line.indexOf( "(" ) > 0 ) { - e.setOsScientificName( DatabaseTools.extract( line, "OS", "(" ) ); + e.setOsScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "(" ) ); } else { - e.setOsScientificName( DatabaseTools.extract( line, "OS", "." ) ); + e.setOsScientificName( SequenceDbWsTools.extractFromTo( line, "OS", "." ) ); } } else if ( line.startsWith( "OX" ) ) { if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) { - e.setTaxId( DatabaseTools.extract( line, "NCBI_TaxID=", ";" ) ); + e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) ); } } + else if ( line.startsWith( "SQ" ) ) { + saw_sq = true; + if ( line.contains( "AA;" ) ) { + is_aa = true; + } + } + else if ( saw_sq && line.startsWith( " " ) ) { + sq_buffer.append( line.replaceAll( "\\s+", "" ) ); + } + } + if ( ( sq_buffer.length() > 0 ) && is_aa ) { + e.setMolecularSequence( BasicSequence.createAaSequence( e.getAccession(), sq_buffer.toString() ) ); } return e; } + + @Override + public SortedSet getAnnotations() { + return null; + } + + @Override + public String getMap() { + return null; + } + + @Override + public String getChromosome() { + return null; + } + + @Override + public MolecularSequence getMolecularSequence() { + return _mol_seq; + } }