X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=forester%2Fjava%2Fsrc%2Forg%2Fforester%2Fws%2Fseqdb%2FUniProtEntry.java;h=ba2d7a402f0b9b7edc647be03e0da1f1a5204686;hb=886c0c0a7a7cef72503df9f21762db1dab594362;hp=a3ea1e383c86bc63957998785640a78e510e0540;hpb=10297bd8b8a4b4ab198a17a42fc6ff24ae2ed49b;p=jalview.git diff --git a/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java b/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java index a3ea1e3..ba2d7a4 100644 --- a/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java +++ b/forester/java/src/org/forester/ws/seqdb/UniProtEntry.java @@ -42,18 +42,19 @@ import org.forester.util.ForesterUtil; public final class UniProtEntry implements SequenceDatabaseEntry { - public final static Pattern BindingDB_PATTERN = Pattern.compile( "BindingDB;\\s+([0-9A-Z]+);" ); - public final static Pattern CTD_PATTERN = Pattern.compile( "CTD;\\s+(\\d+);" ); - public final static Pattern DrugBank_PATTERN = Pattern.compile( "DrugBank;\\s+([0-9A-Z]+);\\s+([^\\.]+)" ); - public final static Pattern GO_PATTERN = Pattern.compile( "GO;\\s+(GO:\\d+);\\s+([PFC]):([^;]+);" ); - public final static Pattern KEGG_PATTERN = Pattern.compile( "KEGG;\\s+([a-z]+:[0-9]+);" ); - public final static Pattern MIM_PATTERN = Pattern.compile( "MIM;\\s+(\\d+);" ); - public final static Pattern NextBio_PATTERN = Pattern.compile( "NextBio;\\s+(\\d+);" ); - public final static Pattern Orphanet_PATTERN = Pattern.compile( "Orphanet;\\s+(\\d+);\\s+([^\\.]+)" ); - public final static Pattern PDB_PATTERN = Pattern.compile( "PDB;\\s+([0-9A-Z]{4});\\s+([^;]+)" ); - public final static Pattern PharmGKB_PATTERN = Pattern.compile( "PharmGKB;\\s+([0-9A-Z]+);" ); - public final static Pattern Reactome_PATTERN = Pattern.compile( "Reactome;\\s+([0-9A-Z]+);\\s+([^\\.]+)" ); - public final static Pattern HGNC_PATTERN = Pattern.compile( "HGNC;\\s+HGNC:(\\d+);" ); + public final static Pattern BindingDB_PATTERN = Pattern.compile( "BindingDB;\\s+([0-9A-Z]+);" ); + public final static Pattern CTD_PATTERN = Pattern.compile( "CTD;\\s+(\\d+);" ); + public final static Pattern DrugBank_PATTERN = Pattern.compile( "DrugBank;\\s+([0-9A-Z]+);\\s+([^\\.]+)" ); + public final static Pattern GO_PATTERN = Pattern.compile( "GO;\\s+(GO:\\d+);\\s+([PFC]):([^;]+);" ); + public final static Pattern KEGG_PATTERN = Pattern.compile( "KEGG;\\s+([a-z]+:[0-9]+);" ); + public final static Pattern MIM_PATTERN = Pattern.compile( "MIM;\\s+(\\d+);" ); + public final static Pattern NextBio_PATTERN = Pattern.compile( "NextBio;\\s+(\\d+);" ); + public final static Pattern Orphanet_PATTERN = Pattern.compile( "Orphanet;\\s+(\\d+);\\s+([^\\.]+)" ); + public final static Pattern PDB_PATTERN = Pattern.compile( "PDB;\\s+([0-9A-Z]{4});\\s+([^;]+)" ); + public final static Pattern PharmGKB_PATTERN = Pattern.compile( "PharmGKB;\\s+([0-9A-Z]+);" ); + public final static Pattern Reactome_PATTERN = Pattern.compile( "Reactome;\\s+([0-9A-Z]+);\\s+([^\\.]+)" ); + public final static Pattern HGNC_PATTERN = Pattern.compile( "HGNC;\\s+HGNC:(\\d+);" ); + public final static Pattern NCBI_TAXID_PATTERN = Pattern.compile( "NCBI_TaxID=(\\d+)" ); private String _ac; private SortedSet _cross_references; private String _gene_name; @@ -123,9 +124,24 @@ public final class UniProtEntry implements SequenceDatabaseEntry { && ForesterUtil.isEmpty( getTaxonomyScientificName() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) && ForesterUtil.isEmpty( getGeneName() ) && ForesterUtil.isEmpty( getTaxonomyIdentifier() ) && ForesterUtil.isEmpty( getSequenceSymbol() ) && ( ( getGoTerms() == null ) || getGoTerms().isEmpty() ) && ( ( getCrossReferences() == null ) || getCrossReferences() - .isEmpty() ) ); + .isEmpty() ) ); } + @Override + public String getMap() { + return null; + } + + @Override + public String getChromosome() { + return null; + } + + @Override + public MolecularSequence getMolecularSequence() { + return _mol_seq; + } + private void addCrossReference( final Accession accession ) { if ( _cross_references == null ) { _cross_references = new TreeSet(); @@ -178,32 +194,58 @@ public final class UniProtEntry implements SequenceDatabaseEntry { } } - public static SequenceDatabaseEntry createInstanceFromPlainText( final List lines ) { + + + @Override + public SortedSet getAnnotations() { + return null; + } + + public final static SequenceDatabaseEntry createInstance( final List lines ) { final UniProtEntry e = new UniProtEntry(); boolean saw_sq = false; final StringBuffer sq_buffer = new StringBuffer(); boolean is_aa = false; for( final String line : lines ) { - //System.out.println( line ); if ( line.startsWith( "AC" ) ) { e.setAc( SequenceDbWsTools.extractFromTo( line, "AC", ";" ) ); } else if ( line.startsWith( "DE" ) && ForesterUtil.isEmpty( e.getSequenceName() ) ) { if ( ( line.indexOf( "RecName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) { - e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", ";" ) ); + if ( line.indexOf( "{" ) > 0 ) { + e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", "{" ) ); + } + else { + e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", ";" ) ); + } } else if ( ( line.indexOf( "SubName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) { - e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", ";" ) ); + if ( line.indexOf( "{" ) > 0 ) { + e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", "{" ) ); + } + else { + e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", ";" ) ); + } } } else if ( line.startsWith( "DE" ) && ForesterUtil.isEmpty( e.getSequenceSymbol() ) ) { if ( line.indexOf( "Short=" ) > 0 ) { - e.setSequenceSymbol( SequenceDbWsTools.extractFromTo( line, "Short=", ";" ) ); + if ( line.indexOf( "{" ) > 0 ) { + e.setSequenceSymbol( SequenceDbWsTools.extractFromTo( line, "Short=", "{" ) ); + } + else { + e.setSequenceSymbol( SequenceDbWsTools.extractFromTo( line, "Short=", ";" ) ); + } } } else if ( line.startsWith( "GN" ) && ForesterUtil.isEmpty( e.getGeneName() ) ) { if ( line.indexOf( "Name=" ) > 0 ) { - e.setGeneName( SequenceDbWsTools.extractFromTo( line, "Name=", ";" ) ); + if ( line.indexOf( "{" ) > 0 ) { + e.setGeneName( SequenceDbWsTools.extractFromTo( line, "Name=", "{" ) ); + } + else { + e.setGeneName( SequenceDbWsTools.extractFromTo( line, "Name=", ";" ) ); + } } } else if ( line.startsWith( "DR" ) ) { @@ -300,7 +342,10 @@ public final class UniProtEntry implements SequenceDatabaseEntry { } else if ( line.startsWith( "OX" ) ) { if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) { - e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) ); + final Matcher m = NCBI_TAXID_PATTERN.matcher( line ); + if ( m.find() ) { + e.setTaxId( m.group( 1 ) ); + } } } else if ( line.startsWith( "SQ" ) ) { @@ -313,29 +358,14 @@ public final class UniProtEntry implements SequenceDatabaseEntry { sq_buffer.append( line.replaceAll( "\\s+", "" ) ); } } - if ( ( sq_buffer.length() > 0 ) && is_aa ) { - e.setMolecularSequence( BasicSequence.createAaSequence( e.getAccession(), sq_buffer.toString() ) ); + if ( sq_buffer.length() > 0 ) { + if ( is_aa ) { + e.setMolecularSequence( BasicSequence.createAaSequence( e.getAccession(), sq_buffer.toString() ) ); + } + else { + e.setMolecularSequence( BasicSequence.createDnaSequence( e.getAccession(), sq_buffer.toString() ) ); + } } return e; } - - @Override - public SortedSet getAnnotations() { - return null; - } - - @Override - public String getMap() { - return null; - } - - @Override - public String getChromosome() { - return null; - } - - @Override - public MolecularSequence getMolecularSequence() { - return _mol_seq; - } }