1 /* Copyright (c) 2009 Peter Troshin
\r
3 * JAva Bioinformatics Analysis Web Services (JABAWS) @version: 1.0
\r
5 * This library is free software; you can redistribute it and/or modify it under the terms of the
\r
6 * Apache License version 2 as published by the Apache Software Foundation
\r
8 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
\r
9 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache
\r
10 * License for more details.
\r
12 * A copy of the license is in apache_license.txt. It is also available here:
\r
13 * @see: http://www.apache.org/licenses/LICENSE-2.0.txt
\r
15 * Any republication or derived work distributed in source code form
\r
16 * must include this copyright and license notice.
\r
19 package compbio.data.sequence;
\r
21 import java.io.BufferedReader;
\r
22 import java.io.BufferedWriter;
\r
23 import java.io.Closeable;
\r
24 import java.io.IOException;
\r
25 import java.io.InputStream;
\r
26 import java.io.InputStreamReader;
\r
27 import java.io.OutputStream;
\r
28 import java.io.OutputStreamWriter;
\r
29 import java.util.ArrayList;
\r
30 import java.util.List;
\r
31 import java.util.logging.Level;
\r
32 import java.util.regex.Matcher;
\r
33 import java.util.regex.Pattern;
\r
36 * Utility class for operations on sequences
\r
40 * Date September 2009
\r
42 public final class SequenceUtil {
\r
/**
 * A whitespace character: [ \t\n\x0B\f\r]
 */
public static final Pattern WHITE_SPACE = Pattern.compile("\\s");

/**
 * A digit: [0-9]
 */
public static final Pattern DIGIT = Pattern.compile("\\d");

/**
 * A non-word character: [^a-zA-Z_0-9]
 */
public static final Pattern NONWORD = Pattern.compile("\\W");

/**
 * One or more one-letter amino acid codes (the twenty letters listed in
 * the character class), matched case-insensitively
 */
public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",
        Pattern.CASE_INSENSITIVE);

/**
 * inversion of AA pattern: one or more characters that are not amino acid
 * letters
 */
public static final Pattern NON_AA = Pattern.compile(
        "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);

/**
 * Same as AA pattern but with two additional letters - XU
 */
public static final Pattern AMBIGUOUS_AA = Pattern.compile(
        "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);

/**
 * Nucleotides a, t, g, c, u
 */
public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",
        Pattern.CASE_INSENSITIVE);

/**
 * Ambiguous nucleotide
 */
public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(
        "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC

/**
 * inversion of NUCLEOTIDE pattern: one or more characters other than a, t,
 * g, c, u
 */
public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",
        Pattern.CASE_INSENSITIVE);
\r
94 private SequenceUtil() {
\r
95 } // utility class, no instantiation
\r
98 * public static void write_PirSeq(OutputStream os, FastaSequence seq)
\r
99 * throws IOException { BufferedWriter pir_out = new BufferedWriter(new
\r
100 * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +
\r
101 * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +
\r
102 * SysPrefs.newlinechar); pir_out.close(); }
\r
104 * public static void write_FastaSeq(OutputStream os, FastaSequence seq)
\r
105 * throws IOException { BufferedWriter fasta_out = new BufferedWriter( new
\r
106 * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +
\r
107 * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +
\r
108 * SysPrefs.newlinechar); fasta_out.close(); }
\r
112 * @return true is the sequence contains only letters a,c, t, g, u
\r
114 public static boolean isNucleotideSequence(FastaSequence s) {
\r
115 return isNonAmbNucleotideSequence(s.getSequence());
\r
119 * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one
\r
122 public static boolean isNonAmbNucleotideSequence(String sequence) {
\r
123 sequence = cleanSequence(sequence);
\r
124 if (DIGIT.matcher(sequence).find()) {
\r
127 if (NON_NUCLEOTIDE.matcher(sequence).find()) {
\r
130 * System.out.format("I found the text starting at " +
\r
131 * "index %d and ending at index %d.%n", nonDNAmatcher .start(),
\r
132 * nonDNAmatcher.end());
\r
135 Matcher DNAmatcher = NUCLEOTIDE.matcher(sequence);
\r
136 return DNAmatcher.find();
\r
140 * Removes all whitespace chars in the sequence string
\r
143 * @return cleaned up sequence
\r
145 public static String cleanSequence(String sequence) {
\r
146 assert sequence != null;
\r
147 final Matcher m = WHITE_SPACE.matcher(sequence);
\r
148 sequence = m.replaceAll("").toUpperCase();
\r
153 * Removes all special characters and digits as well as whitespace chars
\r
154 * from the sequence
\r
157 * @return cleaned up sequence
\r
159 public static String deepCleanSequence(String sequence) {
\r
160 sequence = cleanSequence(sequence);
\r
161 sequence = DIGIT.matcher(sequence).replaceAll("");
\r
162 sequence = NONWORD.matcher(sequence).replaceAll("");
\r
163 Pattern othernonSeqChars = Pattern.compile("[_-]+");
\r
164 sequence = othernonSeqChars.matcher(sequence).replaceAll("");
\r
171 * @return true is the sequence is a protein sequence, false overwise
\r
173 public static boolean isProteinSequence(String sequence) {
\r
174 sequence = cleanSequence(sequence);
\r
175 if (isNonAmbNucleotideSequence(sequence)) {
\r
178 if (DIGIT.matcher(sequence).find()) {
\r
181 if (NON_AA.matcher(sequence).find()) {
\r
184 Matcher protmatcher = AA.matcher(sequence);
\r
185 return protmatcher.find();
\r
189 * Check whether the sequence confirms to amboguous protein sequence
\r
192 * @return return true only if the sequence if ambiguous protein sequence
\r
193 * Return false otherwise. e.g. if the sequence is non-ambiguous
\r
196 public static boolean isAmbiguosProtein(String sequence) {
\r
197 sequence = cleanSequence(sequence);
\r
198 if (isNonAmbNucleotideSequence(sequence)) {
\r
201 if (DIGIT.matcher(sequence).find()) {
\r
204 if (NON_AA.matcher(sequence).find()) {
\r
207 if (AA.matcher(sequence).find()) {
\r
210 Matcher amb_prot = AMBIGUOUS_AA.matcher(sequence);
\r
211 return amb_prot.find();
\r
215 * Writes list of FastaSequeces into the outstream formatting the sequence
\r
216 * so that it contains width chars on each line
\r
221 * - the maximum number of characters to write in one line
\r
222 * @throws IOException
\r
224 public static void writeFasta(OutputStream outstream,
\r
225 List<FastaSequence> sequences, int width) throws IOException {
\r
226 OutputStreamWriter writer = new OutputStreamWriter(outstream);
\r
227 BufferedWriter fastawriter = new BufferedWriter(writer);
\r
228 for (FastaSequence fs : sequences) {
\r
229 fastawriter.write(fs.getOnelineFasta());
\r
232 fastawriter.close();
\r
237 * Reads fasta sequences from inStream into the list of FastaSequence
\r
242 * @return list of FastaSequence objects
\r
243 * @throws IOException
\r
245 public static List<FastaSequence> readFasta(InputStream inStream)
\r
246 throws IOException {
\r
247 List<FastaSequence> seqs = new ArrayList<FastaSequence>();
\r
248 InputStreamReader inReader = new InputStreamReader(inStream);
\r
249 BufferedReader infasta = new BufferedReader(inReader);
\r
250 Pattern pattern = Pattern.compile("//s+");
\r
253 String sname = "", seqstr = null;
\r
255 line = infasta.readLine();
\r
256 if (line == null || line.startsWith(">")) {
\r
257 if (seqstr != null)
\r
258 seqs.add(new FastaSequence(sname.substring(1), seqstr));
\r
259 sname = line; // remove >
\r
262 String subseq = pattern.matcher(line).replaceAll("");
\r
265 } while (line != null);
\r
272 * Writes FastaSequence in the file, each sequence will take one line only
\r
276 * @throws IOException
\r
278 public static void writeFasta(OutputStream os, List<FastaSequence> sequences)
\r
279 throws IOException {
\r
280 OutputStreamWriter outWriter = new OutputStreamWriter(os);
\r
281 BufferedWriter fasta_out = new BufferedWriter(outWriter);
\r
282 for (FastaSequence fs : sequences) {
\r
283 fasta_out.write(fs.getOnelineFasta());
\r
290 * Closes the Closable and logs the exception if any
\r
295 public final static void closeSilently(java.util.logging.Logger log,
\r
296 Closeable stream) {
\r
297 if (stream != null) {
\r
300 } catch (IOException e) {
\r
301 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());
\r