/* * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin * Jalview Web Services version: 2.0 This library is free software; you can * redistribute it and/or modify it under the terms of the Apache License * version 2 as published by the Apache Software Foundation This library is * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the * license is in apache_license.txt. It is also available here: see: * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived * work distributed in source code form must include this copyright and license * notice. */ package compbio.data.sequence; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.Closeable; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.List; import java.util.Scanner; import java.util.logging.Level; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Utility class for operations on sequences * * @author Petr Troshin * @version 1.0 */ public final class SequenceUtil { /** * A whitespace character: [\t\n\x0B\f\r] */ public static final Pattern WHITE_SPACE = Pattern.compile("\\s"); /** * A digit */ public static final Pattern DIGIT = Pattern.compile("\\d"); /** * Non word */ public static final Pattern NONWORD = Pattern.compile("\\W"); /** * Valid Amino acids */ public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE); /** * inversion of AA pattern */ public static final Pattern NON_AA = Pattern.compile( "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE); /** * Same as AA pattern but with two additional letters - XU */ public static final Pattern AMBIGUOUS_AA = Pattern.compile( "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE); /** * Nucleotides a, t, g, c, u */ public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+", Pattern.CASE_INSENSITIVE); /** * Ambiguous nucleotide */ public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile( "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC /** * Non nucleotide */ public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+", Pattern.CASE_INSENSITIVE); private SequenceUtil() { } // utility class, no instantiation /* * public static void write_PirSeq(OutputStream os, FastaSequence seq) * throws IOException { BufferedWriter pir_out = new BufferedWriter(new * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() + * SysPrefs.newlinechar); pir_out.write(seq.getSequence() + * SysPrefs.newlinechar); pir_out.close(); } public static void * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException { * BufferedWriter fasta_out = new BufferedWriter( new * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() + * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() + * SysPrefs.newlinechar); fasta_out.close(); } */ /** * @return true is the sequence contains only letters a,c, t, g, u */ public static boolean isNucleotideSequence(final FastaSequence s) { return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence()); } /** * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one * (!) - B char */ public static boolean isNonAmbNucleotideSequence(String sequence) { sequence = SequenceUtil.cleanSequence(sequence); if (SequenceUtil.DIGIT.matcher(sequence).find()) { return false; } if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) { return false; /* * System.out.format("I found the text starting at " + * "index %d and ending at index %d.%n", nonDNAmatcher .start(), * nonDNAmatcher.end()); */ } final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence); return DNAmatcher.find(); } /** * Removes all whitespace chars in the sequence string * * @param sequence * @return cleaned up sequence */ public static String cleanSequence(String sequence) { assert sequence != null; final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence); sequence = m.replaceAll("").toUpperCase(); return sequence; } /** * Removes all special characters and digits as well as whitespace chars * from the sequence * * @param sequence * @return cleaned up sequence */ public static String deepCleanSequence(String sequence) { sequence = SequenceUtil.cleanSequence(sequence); sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll(""); sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll(""); final Pattern othernonSeqChars = Pattern.compile("[_-]+"); sequence = othernonSeqChars.matcher(sequence).replaceAll(""); return sequence; } /** * @param sequence * @return true is the sequence is a protein sequence, false overwise */ public static boolean isProteinSequence(String sequence) { sequence = SequenceUtil.cleanSequence(sequence); if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) { return false; } if (SequenceUtil.DIGIT.matcher(sequence).find()) { return false; } if (SequenceUtil.NON_AA.matcher(sequence).find()) { return false; } final Matcher protmatcher = SequenceUtil.AA.matcher(sequence); return protmatcher.find(); } /** * Check whether the sequence confirms to amboguous protein sequence * * @param sequence * @return return true only if the sequence if ambiguous protein sequence * Return false otherwise. e.g. if the sequence is non-ambiguous * protein or DNA */ public static boolean isAmbiguosProtein(String sequence) { sequence = SequenceUtil.cleanSequence(sequence); if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) { return false; } if (SequenceUtil.DIGIT.matcher(sequence).find()) { return false; } if (SequenceUtil.NON_AA.matcher(sequence).find()) { return false; } if (SequenceUtil.AA.matcher(sequence).find()) { return false; } final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence); return amb_prot.find(); } /** * Writes list of FastaSequeces into the outstream formatting the sequence * so that it contains width chars on each line * * @param outstream * @param sequences * @param width * - the maximum number of characters to write in one line * @throws IOException */ public static void writeFasta(final OutputStream outstream, final List sequences, final int width) throws IOException { writeFastaKeepTheStream(outstream, sequences, width); outstream.close(); } public static void writeFastaKeepTheStream(final OutputStream outstream, final List sequences, final int width) throws IOException { final OutputStreamWriter writer = new OutputStreamWriter(outstream); final BufferedWriter fastawriter = new BufferedWriter(writer); for (final FastaSequence fs : sequences) { fastawriter.write(">" + fs.getId() + "\n"); fastawriter.write(fs.getFormatedSequence(width)); fastawriter.write("\n"); } fastawriter.flush(); writer.flush(); } /** * Reads fasta sequences from inStream into the list of FastaSequence * objects * * @param inStream * from * @return list of FastaSequence objects * @throws IOException */ public static List readFasta(final InputStream inStream) throws IOException { final List seqs = new ArrayList(); final BufferedReader infasta = new BufferedReader( new InputStreamReader(inStream, "UTF8"), 16000); final Pattern pattern = Pattern.compile("//s+"); String line; String sname = "", seqstr = null; do { line = infasta.readLine(); if ((line == null) || line.startsWith(">")) { if (seqstr != null) { seqs.add(new FastaSequence(sname.substring(1), seqstr)); } sname = line; // remove > seqstr = ""; } else { final String subseq = pattern.matcher(line).replaceAll(""); seqstr += subseq; } } while (line != null); infasta.close(); return seqs; } /** * Writes FastaSequence in the file, each sequence will take one line only * * @param os * @param sequences * @throws IOException */ public static void writeFasta(final OutputStream os, final List sequences) throws IOException { final OutputStreamWriter outWriter = new OutputStreamWriter(os); final BufferedWriter fasta_out = new BufferedWriter(outWriter); for (final FastaSequence fs : sequences) { fasta_out.write(fs.getOnelineFasta()); } fasta_out.close(); outWriter.close(); } public static List readJRonn(final File result) throws IOException, UnknownFileFormatException { InputStream input = new FileInputStream(result); List sequences = readJRonn(input); input.close(); return sequences; } /** * Reader for JRonn horizontal file format >Foobar M G D T T A G 0.48 0.42 * 0.42 0.48 0.52 0.53 0.54 All values are tab delimited * * @param inStream * @return * @throws IOException * @throws UnknownFileFormatException */ public static List readJRonn(final InputStream inStream) throws IOException, UnknownFileFormatException { final List seqs = new ArrayList(); final BufferedReader infasta = new BufferedReader( new InputStreamReader(inStream, "UTF8"), 16000); String line; String sname = ""; do { line = infasta.readLine(); if (line == null || line.isEmpty()) { // skip empty lines continue; } if (line.startsWith(">")) { // read name sname = line.trim().substring(1); // read sequence line line = infasta.readLine(); final String sequence = line.replace("\t", ""); // read annotation line line = infasta.readLine(); String[] annotValues = line.split("\t"); float[] annotation = convertToNumber(annotValues); if (annotation.length != sequence.length()) { throw new UnknownFileFormatException( "File does not look like Jronn horizontally formatted output file!\n" + JRONN_WRONG_FORMAT_MESSAGE); } seqs.add(new AnnotatedSequence(sname, sequence, annotation)); } } while (line != null); infasta.close(); return seqs; } private static float[] convertToNumber(String[] annotValues) throws UnknownFileFormatException { float[] annotation = new float[annotValues.length]; try { for (int i = 0; i < annotation.length; i++) { annotation[i] = Float.parseFloat(annotValues[i]); } } catch (NumberFormatException e) { throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE, e.getCause()); } return annotation; } private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n" + ">sequence_name\n " + "M V S\n" + "0.43 0.22 0.65\n" + "Where first line is the sequence name,\n" + "second line is the tab delimited sequence,\n" + "third line contains tab delimited disorder prediction values.\n" + "No lines are allowed between these three. Additionally, the number of " + "sequence residues must be equal to the number of the disorder values."; /** * Closes the Closable and logs the exception if any * * @param log * @param stream */ public final static void closeSilently(java.util.logging.Logger log, Closeable stream) { if (stream != null) { try { stream.close(); } catch (IOException e) { log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause()); } } } /** * * TODO complete! * * # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512 0.37094 T 0.79983 * 0.85864 0.44331 .... # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512 * 0.37094 * * @param input * @return * @throws IOException * @throws UnknownFileFormatException */ public static List> readDisembl( final InputStream input) throws IOException, UnknownFileFormatException { Scanner scan = new Scanner(input); scan.useDelimiter("# RESIDUE COILS REM465 HOTLOOPS\n"); if (!scan.hasNext()) { throw new UnknownFileFormatException( "In Disembl score format each seqeunce score is expected to start from the line: " + "'# RESIDUE COILS REM465 HOTLOOPS\\n'." + " No such line was found!"); } List> results = new ArrayList>(); int seqCounter = 0; while (scan.hasNext()) { seqCounter++; String singleSeq = scan.next(); Scanner scansingle = new Scanner(singleSeq); StringBuffer seqbuffer = new StringBuffer(); List coils = new ArrayList(); List rem = new ArrayList(); List hotloops = new ArrayList(); MultiAnnotatedSequence disemblRes = new MultiAnnotatedSequence( DisemblResultAnnot.class); while (scansingle.hasNextLine()) { String valueLine = scansingle.nextLine(); Scanner values = new Scanner(valueLine); seqbuffer.append(values.next()); coils.add(values.nextFloat()); rem.add(values.nextFloat()); hotloops.add(values.nextFloat()); values.close(); } disemblRes.addAnnotation(DisemblResultAnnot.COILS, coils); disemblRes.addAnnotation(DisemblResultAnnot.REM465, rem); disemblRes.addAnnotation(DisemblResultAnnot.HOTLOOPS, hotloops); // TODO // disemblRes.sequence = seqbuffer.toString(); scansingle.close(); results.add(disemblRes); } input.close(); return results; } }