// Precompiled regular expressions for extracting identifiers from database text lines.
//   PDB_PATTERN        - "PDB;" + whitespace; group 1 = exactly four characters from [0-9A-Z];
//                        then ";" + whitespace; group 2 = the following run of non-';' characters.
//   PharmGKB_PATTERN   - "PharmGKB;" + whitespace; group 1 = one or more [0-9A-Z], terminated by ";".
//   Reactome_PATTERN   - "Reactome;" + whitespace; group 1 = one or more [0-9A-Z]; then ";" + whitespace;
//                        group 2 = the following run of characters up to (not including) the first '.'.
//   HGNC_PATTERN       - "HGNC;" + whitespace + "HGNC:"; group 1 = the numeric id (digits only),
//                        terminated by ";". The "HGNC:" prefix itself is not captured.
//   NCBI_TAXID_PATTERN - "NCBI_TaxID="; group 1 = the digits that immediately follow. Nothing is
//                        required after the digits, so any trailing text is ignored by the match.
public final static Pattern PDB_PATTERN = Pattern.compile( "PDB;\\s+([0-9A-Z]{4});\\s+([^;]+)" );
public final static Pattern PharmGKB_PATTERN = Pattern.compile( "PharmGKB;\\s+([0-9A-Z]+);" );
public final static Pattern Reactome_PATTERN = Pattern.compile( "Reactome;\\s+([0-9A-Z]+);\\s+([^\\.]+)" );
+ public final static Pattern HGNC_PATTERN = Pattern.compile( "HGNC;\\s+HGNC:(\\d+);" );
+ public final static Pattern NCBI_TAXID_PATTERN= Pattern.compile( "NCBI_TaxID=(\\d+)" );
+
// Per-entry state. NOTE(review): the roles below are inferred from the field names only;
// the code that reads and writes these fields is not visible here — confirm before relying on them.
private String _ac; // presumably the entry's accession — TODO confirm
private SortedSet<Accession> _cross_references; // sorted set of Accession objects; presumably the entry's cross-references — verify
private String _gene_name; // presumably the gene name — verify
&& ForesterUtil.isEmpty( getTaxonomyScientificName() ) && ForesterUtil.isEmpty( getSequenceSymbol() )
&& ForesterUtil.isEmpty( getGeneName() ) && ForesterUtil.isEmpty( getTaxonomyIdentifier() )
&& ForesterUtil.isEmpty( getSequenceSymbol() ) && ( ( getGoTerms() == null ) || getGoTerms().isEmpty() ) && ( ( getCrossReferences() == null ) || getCrossReferences()
- .isEmpty() ) );
+ .isEmpty() ) );
}
private void addCrossReference( final Accession accession ) {
}
else if ( line.startsWith( "DE" ) && ForesterUtil.isEmpty( e.getSequenceName() ) ) {
if ( ( line.indexOf( "RecName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) {
- e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", ";" ) );
+ if ( line.indexOf( "{" ) > 0 ) {
+ e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", "{" ) );
+ }
+ else {
+ e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", ";" ) );
+ }
}
else if ( ( line.indexOf( "SubName:" ) > 0 ) && ( line.indexOf( "Full=" ) > 0 ) ) {
- e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", ";" ) );
+ if ( line.indexOf( "{" ) > 0 ) {
+ e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", "{" ) );
+ }
+ else {
+ e.setSequenceName( SequenceDbWsTools.extractFromTo( line, "Full=", ";" ) );
+ }
+
}
}
else if ( line.startsWith( "DE" ) && ForesterUtil.isEmpty( e.getSequenceSymbol() ) ) {
+
if ( line.indexOf( "Short=" ) > 0 ) {
- e.setSequenceSymbol( SequenceDbWsTools.extractFromTo( line, "Short=", ";" ) );
+ if ( line.indexOf( "{" ) > 0 ) {
+ e.setSequenceSymbol( SequenceDbWsTools.extractFromTo( line, "Short=", "{" ) );
+ }
+ else {
+ e.setSequenceSymbol( SequenceDbWsTools.extractFromTo( line, "Short=", ";" ) );
+ }
+
}
}
else if ( line.startsWith( "GN" ) && ForesterUtil.isEmpty( e.getGeneName() ) ) {
if ( line.indexOf( "Name=" ) > 0 ) {
- e.setGeneName( SequenceDbWsTools.extractFromTo( line, "Name=", ";" ) );
+ if ( line.indexOf( "{" ) > 0 ) {
+ e.setGeneName( SequenceDbWsTools.extractFromTo( line, "Name=", "{" ) );
+ }
+ else {
+ e.setGeneName( SequenceDbWsTools.extractFromTo( line, "Name=", ";" ) );
+ }
}
}
else if ( line.startsWith( "DR" ) ) {
e.addCrossReference( new Accession( m.group( 1 ), "Reactome", m.group( 2 ) ) );
}
}
+ else if ( line.indexOf( "HGNC;" ) > 0 ) {
+ final Matcher m = HGNC_PATTERN.matcher( line );
+ if ( m.find() ) {
+ e.addCrossReference( new Accession( m.group( 1 ), "HGNC" ) );
+ }
+ }
}
else if ( line.startsWith( "OS" ) ) {
if ( line.indexOf( "(" ) > 0 ) {
}
else if ( line.startsWith( "OX" ) ) {
if ( line.indexOf( "NCBI_TaxID=" ) > 0 ) {
- e.setTaxId( SequenceDbWsTools.extractFromTo( line, "NCBI_TaxID=", ";" ) );
+ final Matcher m = NCBI_TAXID_PATTERN.matcher( line );
+ if ( m.find() ) {
+ e.setTaxId( m.group( 1 ) );
+ }
}
}
else if ( line.startsWith( "SQ" ) ) {