- /**\r
- * A whitespace character: [\t\n\x0B\f\r]\r
- */\r
- public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
-\r
- /**\r
- * A digit\r
- */\r
- public static final Pattern DIGIT = Pattern.compile("\\d");\r
-\r
- /**\r
- * Non word\r
- */\r
- public static final Pattern NONWORD = Pattern.compile("\\W");\r
-\r
- /**\r
- * Valid Amino acids\r
- */\r
- public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
- Pattern.CASE_INSENSITIVE);\r
-\r
- /**\r
- * inversion of AA pattern\r
- */\r
- public static final Pattern NON_AA = Pattern.compile(\r
- "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
-\r
- /**\r
- * Same as AA pattern but with two additional letters - XU\r
- */\r
- public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
- "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
-\r
- /**\r
- * Nucleotides a, t, g, c, u\r
- */\r
- public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
- Pattern.CASE_INSENSITIVE);\r
-\r
- /**\r
- * Ambiguous nucleotide\r
- */\r
- public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
- "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
- /**\r
- * Non nucleotide\r
- */\r
- public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
- Pattern.CASE_INSENSITIVE);\r
-\r
- private SequenceUtil() {\r
- } // utility class, no instantiation\r
-\r
- /*\r
- * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
- * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
- * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
- * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
- * SysPrefs.newlinechar); pir_out.close(); }\r
- * \r
- * public static void write_FastaSeq(OutputStream os, FastaSequence seq)\r
- * throws IOException { BufferedWriter fasta_out = new BufferedWriter( new\r
- * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
- * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
- * SysPrefs.newlinechar); fasta_out.close(); }\r
- */\r
-\r
- /**\r
- * @return true is the sequence contains only letters a,c, t, g, u\r
- */\r
- public static boolean isNucleotideSequence(final FastaSequence s) {\r
- return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
- }\r
-\r
- /**\r
- * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
- * (!) - B char\r
- */\r
- public static boolean isNonAmbNucleotideSequence(String sequence) {\r
- sequence = SequenceUtil.cleanSequence(sequence);\r
- if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
- return false;\r
+ /**\r
+ * A single whitespace character, i.e. the predefined class {@code \s}:\r
+ * [ \t\n\x0B\f\r] (note that the plain space is included)\r
+ */\r
+ public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
+\r
+ /**\r
+ * A single decimal digit, i.e. the predefined class {@code \d}\r
+ */\r
+ public static final Pattern DIGIT = Pattern.compile("\\d");\r
+\r
+ /**\r
+ * A single non word character, i.e. the predefined class {@code \W}:\r
+ * anything other than [a-zA-Z_0-9]\r
+ */\r
+ public static final Pattern NONWORD = Pattern.compile("\\W");\r
+\r
+ /**\r
+ * Valid Amino acids: a run of one or more of the letters\r
+ * ARNDCQEGHILKMFPSTWYV, case insensitive\r
+ */\r
+ public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
+ Pattern.CASE_INSENSITIVE);\r
+\r
+ /**\r
+ * Inversion of the AA pattern: a run of one or more characters that are\r
+ * not valid amino acid letters, case insensitive. Note that the ambiguous\r
+ * letters X and U are matched by this pattern too.\r
+ */\r
+ public static final Pattern NON_AA = Pattern.compile(\r
+ "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
+\r
+ /**\r
+ * Same as the AA pattern but with two additional letters - X and U\r
+ */\r
+ public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
+ "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
+\r
+ /**\r
+ * Nucleotides: a run of one or more of the letters a, t, g, c, u, case\r
+ * insensitive\r
+ */\r
+ public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
+ Pattern.CASE_INSENSITIVE);\r
+\r
+ /**\r
+ * Ambiguous nucleotide: a run of one or more of the letters\r
+ * AGTCRYMKSWHBVDNU, case insensitive\r
+ */\r
+ public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
+ "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
+ /**\r
+ * Non nucleotide: inversion of the NUCLEOTIDE pattern (not of\r
+ * AMBIGUOUS_NUCLEOTIDE), i.e. a run of one or more characters other than\r
+ * a, t, g, c, u, case insensitive\r
+ */\r
+ public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
+ Pattern.CASE_INSENSITIVE);\r
+\r
+ /**\r
+  * Utility class: the constructor is private so that no instance can be\r
+  * created from outside.\r
+  */\r
+ private SequenceUtil() {\r
+     // intentionally empty\r
+ }\r
+\r
+ /*\r
+ * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
+ * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
+ * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
+ * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
+ * SysPrefs.newlinechar); pir_out.close(); } public static void\r
+ * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
+ * BufferedWriter fasta_out = new BufferedWriter( new\r
+ * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
+ * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
+ * SysPrefs.newlinechar); fasta_out.close(); }\r
+ */\r
+\r
+ /**\r
+  * Checks whether the given FASTA record holds a non-ambiguous nucleotide\r
+  * sequence.\r
+  *\r
+  * @param s\r
+  *            the record whose sequence is examined\r
+  * @return true if the sequence contains only the letters a, c, t, g, u\r
+  */\r
+ public static boolean isNucleotideSequence(final FastaSequence s) {\r
+     final String residues = s.getSequence();\r
+     return isNonAmbNucleotideSequence(residues);\r
+ }\r
+\r
+ /**\r
+  * Tells whether the sequence is made up solely of the non-ambiguous\r
+  * nucleotide letters A, G, T, C and U. Whitespace is ignored and letter\r
+  * case does not matter. A digit or any other character makes the sequence\r
+  * fail the test, and so does a sequence that is empty after clean up.\r
+  *\r
+  * @param sequence\r
+  *            the sequence to examine\r
+  * @return true if the cleaned up sequence is a non-ambiguous nucleotide\r
+  *         sequence, false otherwise\r
+  */\r
+ public static boolean isNonAmbNucleotideSequence(String sequence) {\r
+     final String cleaned = SequenceUtil.cleanSequence(sequence);\r
+     final boolean hasForeignChars = SequenceUtil.DIGIT.matcher(cleaned).find()\r
+             || SequenceUtil.NON_NUCLEOTIDE.matcher(cleaned).find();\r
+     // nothing left after clean up means there is no nucleotide to find\r
+     return !hasForeignChars\r
+             && SequenceUtil.NUCLEOTIDE.matcher(cleaned).find();\r
+ }\r
+\r
+ /**\r
+  * Removes all whitespace chars in the sequence string and converts the\r
+  * remaining characters to upper case.\r
+  * <p>\r
+  * The case conversion uses the root locale so that the result does not\r
+  * depend on the default locale of the JVM. With a Turkish or Azeri default\r
+  * locale a lower case 'i' would otherwise be turned into a dotted capital\r
+  * I, which none of the sequence patterns of this class matches.\r
+  *\r
+  * @param sequence\r
+  *            the sequence to clean, must not be null\r
+  * @return cleaned up sequence\r
+  */\r
+ public static String cleanSequence(String sequence) {\r
+     assert sequence != null;\r
+     final String noWhitespace = SequenceUtil.WHITE_SPACE.matcher(sequence)\r
+             .replaceAll("");\r
+     // fully qualified to avoid depending on an import of java.util.Locale\r
+     return noWhitespace.toUpperCase(java.util.Locale.ROOT);\r
+ }\r
+\r
+ /**\r
+ * Removes all special characters and digits as well as whitespace chars\r
+ * from the sequence\r
+ * \r
+ * @param sequence\r
+ * @return cleaned up sequence\r
+ */\r
+ public static String deepCleanSequence(String sequence) {\r
+ sequence = SequenceUtil.cleanSequence(sequence);\r
+ sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
+ sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
+ final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
+ sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
+ return sequence;\r
+ }\r
+\r
+ /**\r
+  * Tells whether the sequence is a protein sequence. Whitespace is ignored.\r
+  * The sequence is rejected if it is a non-ambiguous nucleotide sequence,\r
+  * or if it contains a digit or any character that is not a valid amino\r
+  * acid letter.\r
+  *\r
+  * @param sequence\r
+  *            the sequence to examine\r
+  * @return true if the sequence is a protein sequence, false otherwise\r
+  */\r
+ public static boolean isProteinSequence(String sequence) {\r
+     final String cleaned = SequenceUtil.cleanSequence(sequence);\r
+     final boolean rejected = SequenceUtil.isNonAmbNucleotideSequence(cleaned)\r
+             || SequenceUtil.DIGIT.matcher(cleaned).find()\r
+             || SequenceUtil.NON_AA.matcher(cleaned).find();\r
+     return !rejected && SequenceUtil.AA.matcher(cleaned).find();\r
+ }\r
+\r
+ /**\r
+  * Checks whether the sequence conforms to an ambiguous protein sequence,\r
+  * i.e. it consists only of valid amino acid letters plus the ambiguous\r
+  * letters X and U, and contains at least one X or U. Whitespace is ignored\r
+  * and letter case does not matter.\r
+  * <p>\r
+  * The previous implementation could never return true: it rejected every\r
+  * sequence matched by NON_AA (which includes X and U) and then every\r
+  * sequence containing a plain amino acid letter.\r
+  *\r
+  * @param sequence\r
+  *            the sequence to examine\r
+  * @return true only if the sequence is an ambiguous protein sequence,\r
+  *         false otherwise, e.g. if the sequence is a non-ambiguous protein\r
+  *         or a non-ambiguous nucleotide sequence, is empty, or contains\r
+  *         digits or other characters\r
+  */\r
+ public static boolean isAmbiguosProtein(String sequence) {\r
+     sequence = SequenceUtil.cleanSequence(sequence);\r
+     if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
+         return false;\r
+     }\r
+     // Every character has to be an amino acid letter, X or U. This also\r
+     // rejects digits, other symbols and the empty sequence.\r
+     if (!SequenceUtil.AMBIGUOUS_AA.matcher(sequence).matches()) {\r
+         return false;\r
+     }\r
+     // Only X and U can still match NON_AA at this point; without one of\r
+     // them the sequence is an ordinary, non-ambiguous protein.\r
+     return SequenceUtil.NON_AA.matcher(sequence).find();\r
+ }\r
+\r
+ /**\r
+  * Writes list of FastaSequeces into the outstream formatting the sequence\r
+  * so that it contains width chars on each line. The stream is closed when\r
+  * this method returns, whether or not writing succeeded.\r
+  *\r
+  * @param outstream\r
+  *            the stream to write to, closed by this method\r
+  * @param sequences\r
+  *            the sequences to write\r
+  * @param width\r
+  *            - the maximum number of characters to write in one line\r
+  * @throws IOException\r
+  *             if writing to or closing the stream fails\r
+  */\r
+ public static void writeFasta(final OutputStream outstream,\r
+         final List<FastaSequence> sequences, final int width)\r
+         throws IOException {\r
+     try {\r
+         writeFastaKeepTheStream(outstream, sequences, width);\r
+     } finally {\r
+         // do not leak the stream when writing fails\r
+         outstream.close();\r
+     }\r
+ }\r
+\r
+ public static void writeFastaKeepTheStream(final OutputStream outstream,\r
+ final List<FastaSequence> sequences, final int width)\r
+ throws IOException {\r
+ final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
+ final BufferedWriter fastawriter = new BufferedWriter(writer);\r
+ for (final FastaSequence fs : sequences) {\r
+ fastawriter.write(">" + fs.getId() + "\n");\r
+ fastawriter.write(fs.getFormatedSequence(width));\r
+ fastawriter.write("\n");\r
+ }\r
+ fastawriter.flush();\r
+ writer.flush();\r