// underscore character ('_'). For example, a RefSeq protein accession is NP_015325. \r
+ // Group 1 captures the accession: two uppercase letters, an underscore and\r
+ // six or more digits. It must be preceded by the start of the input or a\r
+ // non-alphanumeric character, and followed by a non-alphanumeric character\r
+ // or the end of the input.\r
private final static Pattern REFSEQ_PATTERN = Pattern\r
.compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z]{2}_\\d{6,})(?:[^a-zA-Z0-9]|\\Z)" );\r
+ // See: http://web.expasy.org/docs/userman.html#ID_line\r
+ // Group 1 captures a six-character token: one uppercase letter, one digit,\r
+ // three uppercase letters or digits, and a final digit. The token must be\r
+ // preceded by the start of the input or a non-alphanumeric character, and\r
+ // followed by a non-alphanumeric character or the end of the input.\r
+ private final static Pattern TREMBL_PATTERN = Pattern\r
+ .compile( "(?:\\A|.*[^a-zA-Z0-9])([A-Z][0-9][A-Z0-9]{3}[0-9])(?:[^a-zA-Z0-9]|\\Z)" );\r
\r
/**\r
* Returns null if no match.\r
if ( !ForesterUtil.isEmpty( v ) ) {\r
return new Identifier( v, Identifier.REFSEQ );\r
}\r
+ v = parseTrEMBLAccessor( s );\r
+ if ( !ForesterUtil.isEmpty( v ) ) {\r
+ return new Identifier( v, Identifier.SP );\r
+ }\r
return null;\r
}\r
\r
- public static boolean isProtein( final String query ) {\r
+ /**\r
+ * Reports whether the query appears to contain a protein accession.\r
+ * Three checks are tried in order and the first success returns true:\r
+ * a RefSeq accessor whose second character is 'P', a TrEMBL-style\r
+ * accessor, and finally GENBANK_PROTEIN_AC_PATTERN applied with\r
+ * lookingAt() (matching from the start of the query).\r
+ *\r
+ * @param query the text to inspect\r
+ * @return true if any of the three checks succeeds, false otherwise\r
+ */\r
+ public final static boolean isProtein( final String query ) {\r
+ final String r1 = parseRefSeqAccessor( query );\r
+ // NOTE(review): 'P' in the second position presumably marks a protein\r
+ // record (e.g. NP_015325) - confirm against the RefSeq prefix list.\r
+ if ( !ForesterUtil.isEmpty( r1 ) && ( r1.charAt( 1 ) == 'P' ) ) {\r
+ return true;\r
+ }\r
+ // Any TrEMBL-style accessor found in the query counts as a protein.\r
+ final String r2 = parseTrEMBLAccessor( query );\r
+ if ( !ForesterUtil.isEmpty( r2 ) ) {\r
+ return true;\r
+ }\r
+ // Last resort: the GenBank protein accession pattern.\r
return GENBANK_PROTEIN_AC_PATTERN.matcher( query ).lookingAt();\r
}\r
\r
return null;\r
}\r
\r
+ /**\r
+ * Extracts the token captured by group 1 of TREMBL_PATTERN from the query.\r
+ * The pattern is applied with lookingAt(), so it is anchored at the start\r
+ * of the query but does not have to consume all of it.\r
+ * Returns null if no match.\r
+ * \r
+ * @param query the text to search\r
+ * @return the captured accessor, or null if the pattern does not match\r
+ */\r
+ private final static String parseTrEMBLAccessor( final String query ) {\r
+ final Matcher m = TREMBL_PATTERN.matcher( query );\r
+ if ( m.lookingAt() ) {\r
+ return m.group( 1 );\r
+ }\r
+ return null;\r
+ }\r
+\r
+ // Private constructor: the members visible in this fragment are all\r
+ // static, so the class is not meant to be instantiated from outside.\r
private SequenceIdParser() {\r
// Hiding the constructor.\r
}\r