From: pvtroshin Date: Thu, 28 Apr 2011 15:17:21 +0000 (+0000) Subject: Add method to SequenceUtil to clean the protein sequence X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=a16efab90029be12793ac062994de9627b8f3046;p=jabaws.git Add method to SequenceUtil to clean the protein sequence git-svn-id: link to svn.lifesci.dundee.ac.uk/svn/barton/ptroshin/JABA2@4047 e3abac25-378b-4346-85de-24260fe3988d --- diff --git a/build.xml b/build.xml index fc99421..3d4535e 100644 --- a/build.xml +++ b/build.xml @@ -128,23 +128,7 @@ - - - - - - - - - - - - - - - - - + diff --git a/datamodel/compbio/data/sequence/SequenceUtil.java b/datamodel/compbio/data/sequence/SequenceUtil.java index d0a6cd7..4d61d2a 100644 --- a/datamodel/compbio/data/sequence/SequenceUtil.java +++ b/datamodel/compbio/data/sequence/SequenceUtil.java @@ -170,6 +170,17 @@ public final class SequenceUtil { } /** + * Remove all non AA chars from the sequence + * + * @param sequence + * the sequence to clean + * @return cleaned sequence + */ + public static String cleanProteinSequence(String sequence) { + return SequenceUtil.NON_AA.matcher(sequence).replaceAll(""); + } + + /** * @param sequence * @return true if the sequence is a protein sequence, false otherwise */ diff --git a/testsrc/compbio/data/sequence/SequenceUtilTester.java b/testsrc/compbio/data/sequence/SequenceUtilTester.java index b6e74ae..3e351bf 100644 --- a/testsrc/compbio/data/sequence/SequenceUtilTester.java +++ b/testsrc/compbio/data/sequence/SequenceUtilTester.java @@ -83,6 +83,31 @@ public class SequenceUtilTester { } @Test() + public void testCleanProteinSequence() { + String dirtySeq = "atgAGTggt\taGGTgc\ncgcAC\rTgc gACtcgcGAt cgA "; + assertFalse(SequenceUtil.isProteinSequence(dirtySeq)); + // This will still be a NON protein sequence despite having only correct + // letters, because the letters perfectly match a nucleotide sequence! 
+ assertFalse(SequenceUtil.isProteinSequence(SequenceUtil + .cleanProteinSequence(dirtySeq))); + + String notaSeq = "atgc1tgatgcatgcatgatgmctga"; + assertFalse(SequenceUtil.isProteinSequence(notaSeq)); + assertTrue(SequenceUtil.isProteinSequence(SequenceUtil + .cleanProteinSequence(notaSeq))); + + String AAseq = "ARLGRVRWTQQRHAEAAVLLQQASDAAPEHPGIALWLGHALEDAGQAEAAAAAYTRAHQL"; + assertTrue(SequenceUtil.isProteinSequence(AAseq)); + assertTrue(SequenceUtil.isProteinSequence(SequenceUtil + .cleanProteinSequence(AAseq))); + AAseq += "XU"; + + assertFalse(SequenceUtil.isProteinSequence(AAseq)); + assertTrue(SequenceUtil.isProteinSequence(SequenceUtil + .cleanProteinSequence(AAseq))); + } + + @Test() public void testReadWriteFasta() { try { diff --git a/website/archive/datamodel-1.2.jar b/website/archive/datamodel-1.2.jar index 4c7cffc..ecbf161 100644 Binary files a/website/archive/datamodel-1.2.jar and b/website/archive/datamodel-1.2.jar differ diff --git a/website/archive/datamodel-src-1.2.jar b/website/archive/datamodel-src-1.2.jar index 6b324de..06610ed 100644 Binary files a/website/archive/datamodel-src-1.2.jar and b/website/archive/datamodel-src-1.2.jar differ