2 * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin
\r
3 * Jalview Web Services version: 2.0 This library is free software; you can
\r
4 * redistribute it and/or modify it under the terms of the Apache License
\r
5 * version 2 as published by the Apache Software Foundation This library is
\r
6 * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
\r
7 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
\r
8 * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the
\r
9 * license is in apache_license.txt. It is also available here: see:
\r
10 * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived
\r
11 * work distributed in source code form must include this copyright and license
\r
15 package compbio.data.sequence;
\r
17 import java.io.BufferedReader;
\r
18 import java.io.BufferedWriter;
\r
19 import java.io.Closeable;
\r
20 import java.io.File;
\r
21 import java.io.FileInputStream;
\r
22 import java.io.IOException;
\r
23 import java.io.InputStream;
\r
24 import java.io.InputStreamReader;
\r
25 import java.io.OutputStream;
\r
26 import java.io.OutputStreamWriter;
\r
27 import java.util.ArrayList;
\r
28 import java.util.List;
\r
29 import java.util.Scanner;
\r
30 import java.util.logging.Level;
\r
31 import java.util.regex.Matcher;
\r
32 import java.util.regex.Pattern;
\r
34 import compbio.conservation.Method;
\r
37 * Utility class for operations on sequences
\r
39 * @author Petr Troshin
\r
42 public final class SequenceUtil {
\r
45 * A whitespace character: [\t\n\x0B\f\r]
\r
47 public static final Pattern WHITE_SPACE = Pattern.compile("\\s");
\r
52 public static final Pattern DIGIT = Pattern.compile("\\d");
\r
57 public static final Pattern NONWORD = Pattern.compile("\\W");
\r
62 public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",
\r
63 Pattern.CASE_INSENSITIVE);
\r
66 * inversion of AA pattern
\r
68 public static final Pattern NON_AA = Pattern.compile(
\r
69 "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);
\r
72 * Same as AA pattern but with two additional letters - XU
\r
74 public static final Pattern AMBIGUOUS_AA = Pattern.compile(
\r
75 "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);
\r
78 * Nucleotides a, t, g, c, u
\r
80 public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",
\r
81 Pattern.CASE_INSENSITIVE);
\r
84 * Ambiguous nucleotide
\r
86 public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(
\r
87 "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC
\r
91 public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",
\r
92 Pattern.CASE_INSENSITIVE);
\r
94 private SequenceUtil() {
\r
95 } // utility class, no instantiation
\r
98 * public static void write_PirSeq(OutputStream os, FastaSequence seq)
\r
99 * throws IOException { BufferedWriter pir_out = new BufferedWriter(new
\r
100 * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +
\r
101 * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +
\r
102 * SysPrefs.newlinechar); pir_out.close(); } public static void
\r
103 * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {
\r
104 * BufferedWriter fasta_out = new BufferedWriter( new
\r
105 * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +
\r
106 * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +
\r
107 * SysPrefs.newlinechar); fasta_out.close(); }
\r
111 * @return true is the sequence contains only letters a,c, t, g, u
\r
113 public static boolean isNucleotideSequence(final FastaSequence s) {
\r
114 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());
\r
118 * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one
\r
121 public static boolean isNonAmbNucleotideSequence(String sequence) {
\r
122 sequence = SequenceUtil.cleanSequence(sequence);
\r
123 if (SequenceUtil.DIGIT.matcher(sequence).find()) {
\r
126 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {
\r
129 * System.out.format("I found the text starting at " +
\r
130 * "index %d and ending at index %d.%n", nonDNAmatcher .start(),
\r
131 * nonDNAmatcher.end());
\r
134 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);
\r
135 return DNAmatcher.find();
\r
139 * Removes all whitespace chars in the sequence string
\r
142 * @return cleaned up sequence
\r
144 public static String cleanSequence(String sequence) {
\r
145 assert sequence != null;
\r
146 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);
\r
147 sequence = m.replaceAll("").toUpperCase();
\r
152 * Removes all special characters and digits as well as whitespace chars
\r
153 * from the sequence
\r
156 * @return cleaned up sequence
\r
158 public static String deepCleanSequence(String sequence) {
\r
159 sequence = SequenceUtil.cleanSequence(sequence);
\r
160 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");
\r
161 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");
\r
162 final Pattern othernonSeqChars = Pattern.compile("[_-]+");
\r
163 sequence = othernonSeqChars.matcher(sequence).replaceAll("");
\r
169 * @return true is the sequence is a protein sequence, false overwise
\r
171 public static boolean isProteinSequence(String sequence) {
\r
172 sequence = SequenceUtil.cleanSequence(sequence);
\r
173 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {
\r
176 if (SequenceUtil.DIGIT.matcher(sequence).find()) {
\r
179 if (SequenceUtil.NON_AA.matcher(sequence).find()) {
\r
182 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);
\r
183 return protmatcher.find();
\r
187 * Check whether the sequence confirms to amboguous protein sequence
\r
190 * @return return true only if the sequence if ambiguous protein sequence
\r
191 * Return false otherwise. e.g. if the sequence is non-ambiguous
\r
194 public static boolean isAmbiguosProtein(String sequence) {
\r
195 sequence = SequenceUtil.cleanSequence(sequence);
\r
196 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {
\r
199 if (SequenceUtil.DIGIT.matcher(sequence).find()) {
\r
202 if (SequenceUtil.NON_AA.matcher(sequence).find()) {
\r
205 if (SequenceUtil.AA.matcher(sequence).find()) {
\r
208 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);
\r
209 return amb_prot.find();
\r
213 * Writes list of FastaSequeces into the outstream formatting the sequence
\r
214 * so that it contains width chars on each line
\r
219 * - the maximum number of characters to write in one line
\r
220 * @throws IOException
\r
222 public static void writeFasta(final OutputStream outstream,
\r
223 final List<FastaSequence> sequences, final int width)
\r
224 throws IOException {
\r
225 writeFastaKeepTheStream(outstream, sequences, width);
\r
229 public static void writeFastaKeepTheStream(final OutputStream outstream,
\r
230 final List<FastaSequence> sequences, final int width)
\r
231 throws IOException {
\r
232 final OutputStreamWriter writer = new OutputStreamWriter(outstream);
\r
233 final BufferedWriter fastawriter = new BufferedWriter(writer);
\r
234 for (final FastaSequence fs : sequences) {
\r
235 fastawriter.write(">" + fs.getId() + "\n");
\r
236 fastawriter.write(fs.getFormatedSequence(width));
\r
237 fastawriter.write("\n");
\r
239 fastawriter.flush();
\r
244 * Reads fasta sequences from inStream into the list of FastaSequence
\r
249 * @return list of FastaSequence objects
\r
250 * @throws IOException
\r
252 public static List<FastaSequence> readFasta(final InputStream inStream)
\r
253 throws IOException {
\r
254 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
\r
256 final BufferedReader infasta = new BufferedReader(
\r
257 new InputStreamReader(inStream, "UTF8"), 16000);
\r
258 final Pattern pattern = Pattern.compile("//s+");
\r
261 String sname = "", seqstr = null;
\r
263 line = infasta.readLine();
\r
264 if ((line == null) || line.startsWith(">")) {
\r
265 if (seqstr != null) {
\r
266 seqs.add(new FastaSequence(sname.substring(1), seqstr));
\r
268 sname = line; // remove >
\r
271 final String subseq = pattern.matcher(line).replaceAll("");
\r
274 } while (line != null);
\r
281 * Writes FastaSequence in the file, each sequence will take one line only
\r
285 * @throws IOException
\r
287 public static void writeFasta(final OutputStream os,
\r
288 final List<FastaSequence> sequences) throws IOException {
\r
289 final OutputStreamWriter outWriter = new OutputStreamWriter(os);
\r
290 final BufferedWriter fasta_out = new BufferedWriter(outWriter);
\r
291 for (final FastaSequence fs : sequences) {
\r
292 fasta_out.write(fs.getOnelineFasta());
\r
298 public static List<AnnotatedSequence> readJRonn(final File result)
\r
299 throws IOException, UnknownFileFormatException {
\r
300 InputStream input = new FileInputStream(result);
\r
301 List<AnnotatedSequence> sequences = readJRonn(input);
\r
307 * Reader for JRonn horizontal file format >Foobar M G D T T A G 0.48 0.42
\r
308 * 0.42 0.48 0.52 0.53 0.54 All values are tab delimited
\r
312 * @throws IOException
\r
313 * @throws UnknownFileFormatException
\r
315 public static List<AnnotatedSequence> readJRonn(final InputStream inStream)
\r
316 throws IOException, UnknownFileFormatException {
\r
317 final List<AnnotatedSequence> seqs = new ArrayList<AnnotatedSequence>();
\r
319 final BufferedReader infasta = new BufferedReader(
\r
320 new InputStreamReader(inStream, "UTF8"), 16000);
\r
325 line = infasta.readLine();
\r
326 if (line == null || line.isEmpty()) {
\r
327 // skip empty lines
\r
330 if (line.startsWith(">")) {
\r
332 sname = line.trim().substring(1);
\r
333 // read sequence line
\r
334 line = infasta.readLine();
\r
335 final String sequence = line.replace("\t", "");
\r
336 // read annotation line
\r
337 line = infasta.readLine();
\r
338 String[] annotValues = line.split("\t");
\r
339 float[] annotation = convertToNumber(annotValues);
\r
340 if (annotation.length != sequence.length()) {
\r
341 throw new UnknownFileFormatException(
\r
342 "File does not look like Jronn horizontally formatted output file!\n"
\r
343 + JRONN_WRONG_FORMAT_MESSAGE);
\r
345 seqs.add(new AnnotatedSequence(sname, sequence, annotation));
\r
347 } while (line != null);
\r
353 private static float[] convertToNumber(String[] annotValues)
\r
354 throws UnknownFileFormatException {
\r
355 float[] annotation = new float[annotValues.length];
\r
357 for (int i = 0; i < annotation.length; i++) {
\r
358 annotation[i] = Float.parseFloat(annotValues[i]);
\r
360 } catch (NumberFormatException e) {
\r
361 throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,
\r
367 private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"
\r
368 + ">sequence_name\n "
\r
370 + "0.43 0.22 0.65\n"
\r
371 + "Where first line is the sequence name,\n"
\r
372 + "second line is the tab delimited sequence,\n"
\r
373 + "third line contains tab delimited disorder prediction values.\n"
\r
374 + "No lines are allowed between these three. Additionally, the number of "
\r
375 + "sequence residues must be equal to the number of the disorder values.";
\r
378 * Closes the Closable and logs the exception if any
\r
383 public final static void closeSilently(java.util.logging.Logger log,
\r
384 Closeable stream) {
\r
385 if (stream != null) {
\r
388 } catch (IOException e) {
\r
389 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());
\r
398 * # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512 0.37094 T 0.79983
\r
399 * 0.85864 0.44331 .... # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512
\r
404 * @throws IOException
\r
405 * @throws UnknownFileFormatException
\r
407 public static List<MultiAnnotatedSequence<DisemblResultAnnot>> readDisembl(
\r
408 final InputStream input) throws IOException,
\r
409 UnknownFileFormatException {
\r
410 Scanner scan = new Scanner(input);
\r
411 scan.useDelimiter("# RESIDUE COILS REM465 HOTLOOPS\n");
\r
412 if (!scan.hasNext()) {
\r
413 throw new UnknownFileFormatException(
\r
414 "In Disembl score format each seqeunce score is expected to start from the line: "
\r
415 + "'# RESIDUE COILS REM465 HOTLOOPS\\n'."
\r
416 + " No such line was found!");
\r
419 List<MultiAnnotatedSequence<DisemblResultAnnot>> results = new ArrayList<MultiAnnotatedSequence<DisemblResultAnnot>>();
\r
420 int seqCounter = 0;
\r
421 while (scan.hasNext()) {
\r
423 String singleSeq = scan.next();
\r
424 Scanner scansingle = new Scanner(singleSeq);
\r
425 StringBuffer seqbuffer = new StringBuffer();
\r
426 ArrayList<Float> coils = new ArrayList<Float>();
\r
427 ArrayList<Float> rem = new ArrayList<Float>();
\r
428 ArrayList<Float> hotloops = new ArrayList<Float>();
\r
430 MultiAnnotatedSequence<DisemblResultAnnot> disemblRes = new MultiAnnotatedSequence<DisemblResultAnnot>(
\r
431 DisemblResultAnnot.class);
\r
433 while (scansingle.hasNextLine()) {
\r
434 String valueLine = scansingle.nextLine();
\r
435 Scanner values = new Scanner(valueLine);
\r
436 seqbuffer.append(values.next());
\r
437 coils.add(values.nextFloat());
\r
438 rem.add(values.nextFloat());
\r
439 hotloops.add(values.nextFloat());
\r
442 disemblRes.addAnnotation(DisemblResultAnnot.COILS, coils);
\r
443 disemblRes.addAnnotation(DisemblResultAnnot.REM465, rem);
\r
444 disemblRes.addAnnotation(DisemblResultAnnot.HOTLOOPS, hotloops);
\r
446 // disemblRes.sequence = seqbuffer.toString();
\r
447 scansingle.close();
\r
448 results.add(disemblRes);
\r
456 * Read AACon result with no alignment files. This method leaves incoming
\r
457 * the InputStream results open!
\r
460 * output file of AAConservation
\r
461 * @return {@link MultiAnnotatedSequence}
\r
463 public static MultiAnnotatedSequence<Method> readResults(InputStream results) {
\r
464 if (results == null) {
\r
465 throw new NullPointerException(
\r
466 "InputStream with results must be provided");
\r
468 MultiAnnotatedSequence<Method> annotations = new MultiAnnotatedSequence<Method>(
\r
470 Scanner sc = new Scanner(results);
\r
471 sc.useDelimiter("#");
\r
472 while (sc.hasNext()) {
\r
473 String line = sc.next();
\r
474 int spacePos = line.indexOf(" ");
\r
475 assert spacePos > 0 : "Space is expected as delimited between method "
\r
476 + "name and values!";
\r
477 String methodLine = line.substring(0, spacePos);
\r
478 Method method = Method.getMethod(methodLine);
\r
479 assert method != null : "Method " + methodLine
\r
480 + " is not recognized! ";
\r
481 Scanner valuesScanner = new Scanner(line.substring(spacePos));
\r
482 ArrayList<Float> values = new ArrayList<Float>();
\r
483 while (valuesScanner.hasNextDouble()) {
\r
484 Double value = valuesScanner.nextDouble();
\r
485 values.add(value.floatValue());
\r
487 annotations.addAnnotation(method, values);
\r
489 return annotations;
\r