/* * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin * Jalview Web Services version: 2.0 This library is free software; you can * redistribute it and/or modify it under the terms of the Apache License * version 2 as published by the Apache Software Foundation This library is * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the * license is in apache_license.txt. It is also available here: see: * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived * work distributed in source code form must include this copyright and license * notice. */ package compbio.data.sequence; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.Closeable; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.Set; import java.util.logging.Level; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Utility class for operations on sequences * * @author Petr Troshin * @version 1.0 */ public final class SequenceUtil { /** * A whitespace character: [\t\n\x0B\f\r] */ public static final Pattern WHITE_SPACE = Pattern.compile("\\s"); /** * A digit */ public static final Pattern DIGIT = Pattern.compile("\\d"); /** * Non word */ public static final Pattern NONWORD = Pattern.compile("\\W"); /** * Valid Amino acids */ public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE); /** * inversion of AA pattern */ public static final Pattern NON_AA = Pattern.compile( 
"[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE); /** * Same as AA pattern but with two additional letters - XU */ public static final Pattern AMBIGUOUS_AA = Pattern.compile( "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE); /** * Nucleotides a, t, g, c, u */ public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+", Pattern.CASE_INSENSITIVE); /** * Ambiguous nucleotide */ public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile( "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC /** * Non nucleotide */ public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+", Pattern.CASE_INSENSITIVE); private SequenceUtil() { } // utility class, no instantiation /* * public static void write_PirSeq(OutputStream os, FastaSequence seq) * throws IOException { BufferedWriter pir_out = new BufferedWriter(new * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() + * SysPrefs.newlinechar); pir_out.write(seq.getSequence() + * SysPrefs.newlinechar); pir_out.close(); } public static void * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException { * BufferedWriter fasta_out = new BufferedWriter( new * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() + * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() + * SysPrefs.newlinechar); fasta_out.close(); } */ /** * @return true is the sequence contains only letters a,c, t, g, u */ public static boolean isNucleotideSequence(final FastaSequence s) { return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence()); } /** * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one * (!) 
- B char */ public static boolean isNonAmbNucleotideSequence(String sequence) { sequence = SequenceUtil.cleanSequence(sequence); if (SequenceUtil.DIGIT.matcher(sequence).find()) { return false; } if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) { return false; /* * System.out.format("I found the text starting at " + * "index %d and ending at index %d.%n", nonDNAmatcher .start(), * nonDNAmatcher.end()); */ } final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence); return DNAmatcher.find(); } /** * Removes all whitespace chars in the sequence string * * @param sequence * @return cleaned up sequence */ public static String cleanSequence(String sequence) { assert sequence != null; final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence); sequence = m.replaceAll("").toUpperCase(); return sequence; } /** * Removes all special characters and digits as well as whitespace chars * from the sequence * * @param sequence * @return cleaned up sequence */ public static String deepCleanSequence(String sequence) { sequence = SequenceUtil.cleanSequence(sequence); sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll(""); sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll(""); final Pattern othernonSeqChars = Pattern.compile("[_-]+"); sequence = othernonSeqChars.matcher(sequence).replaceAll(""); return sequence; } /** * @param sequence * @return true is the sequence is a protein sequence, false overwise */ public static boolean isProteinSequence(String sequence) { sequence = SequenceUtil.cleanSequence(sequence); if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) { return false; } if (SequenceUtil.DIGIT.matcher(sequence).find()) { return false; } if (SequenceUtil.NON_AA.matcher(sequence).find()) { return false; } final Matcher protmatcher = SequenceUtil.AA.matcher(sequence); return protmatcher.find(); } /** * Check whether the sequence confirms to amboguous protein sequence * * @param sequence * @return return true only if the sequence if 
ambiguous protein sequence * Return false otherwise. e.g. if the sequence is non-ambiguous * protein or DNA */ public static boolean isAmbiguosProtein(String sequence) { sequence = SequenceUtil.cleanSequence(sequence); if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) { return false; } if (SequenceUtil.DIGIT.matcher(sequence).find()) { return false; } if (SequenceUtil.NON_AA.matcher(sequence).find()) { return false; } if (SequenceUtil.AA.matcher(sequence).find()) { return false; } final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence); return amb_prot.find(); } /** * Writes list of FastaSequeces into the outstream formatting the sequence * so that it contains width chars on each line * * @param outstream * @param sequences * @param width * - the maximum number of characters to write in one line * @throws IOException */ public static void writeFasta(final OutputStream outstream, final List sequences, final int width) throws IOException { writeFastaKeepTheStream(outstream, sequences, width); outstream.close(); } public static void writeFastaKeepTheStream(final OutputStream outstream, final List sequences, final int width) throws IOException { final OutputStreamWriter writer = new OutputStreamWriter(outstream); final BufferedWriter fastawriter = new BufferedWriter(writer); for (final FastaSequence fs : sequences) { fastawriter.write(">" + fs.getId() + "\n"); fastawriter.write(fs.getFormatedSequence(width)); fastawriter.write("\n"); } fastawriter.flush(); writer.flush(); } /** * Reads fasta sequences from inStream into the list of FastaSequence * objects * * @param inStream * from * @return list of FastaSequence objects * @throws IOException */ public static List readFasta(final InputStream inStream) throws IOException { final List seqs = new ArrayList(); final BufferedReader infasta = new BufferedReader( new InputStreamReader(inStream, "UTF8"), 16000); final Pattern pattern = Pattern.compile("//s+"); String line; String sname = "", seqstr = null; do 
{ line = infasta.readLine(); if ((line == null) || line.startsWith(">")) { if (seqstr != null) { seqs.add(new FastaSequence(sname.substring(1), seqstr)); } sname = line; // remove > seqstr = ""; } else { final String subseq = pattern.matcher(line).replaceAll(""); seqstr += subseq; } } while (line != null); infasta.close(); return seqs; } /** * Writes FastaSequence in the file, each sequence will take one line only * * @param os * @param sequences * @throws IOException */ public static void writeFasta(final OutputStream os, final List sequences) throws IOException { final OutputStreamWriter outWriter = new OutputStreamWriter(os); final BufferedWriter fasta_out = new BufferedWriter(outWriter); for (final FastaSequence fs : sequences) { fasta_out.write(fs.getOnelineFasta()); } fasta_out.close(); outWriter.close(); } public static Map readJRonn(final File result) throws IOException, UnknownFileFormatException { InputStream input = new FileInputStream(result); Map sequences = readJRonn(input); input.close(); return sequences; } /** * Reader for JRonn horizontal file format * *
	 * >Foobar M G D T T A G 0.48 0.42
	 * 0.42 0.48 0.52 0.53 0.54
	 * 
	 * 
	 * Where all values are tab delimited
	 * 
	 * @param inStream
	 *            the InputStream connected to the JRonn output file
	 * @return List of {@link AnnotatedSequence} objects
	 * @throws IOException
	 *             is thrown if the inStream has problems accessing the data
	 * @throws UnknownFileFormatException
	 *             is thrown if the inStream represents an unknown source of
	 * data, i.e. not a JRonn output
	 */
	public static Map<String, Score> readJRonn(final InputStream inStream)
			throws IOException, UnknownFileFormatException {
		final Map<String, Score> seqs = new HashMap<String, Score>();

		final BufferedReader infasta = new BufferedReader(
				new InputStreamReader(inStream, "UTF8"), 16000);
		try {
			String line;
			while ((line = infasta.readLine()) != null) {
				if (!line.startsWith(">")) {
					// empty lines, and any other line that does not open a
					// record, are skipped
					continue;
				}
				// read name
				final String sname = line.trim().substring(1);
				// the sequence and annotation lines must follow immediately
				final String sequenceLine = infasta.readLine();
				final String annotationLine = infasta.readLine();
				if (sequenceLine == null || annotationLine == null) {
					// truncated record: previously this ended in a
					// NullPointerException
					throw new UnknownFileFormatException(
							"Unexpected end of input after the sequence name '"
									+ sname + "'!\n"
									+ JRONN_WRONG_FORMAT_MESSAGE);
				}
				final String sequence = sequenceLine.replace("\t", "");
				final float[] annotation = convertToNumber(annotationLine
						.split("\t"));
				if (annotation.length != sequence.length()) {
					throw new UnknownFileFormatException(
							"File does not look like Jronn horizontally formatted output file!\n"
									+ JRONN_WRONG_FORMAT_MESSAGE);
				}
				seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));
			}
		} finally {
			// release the reader (and the wrapped stream) on failure too
			infasta.close();
		}
		return seqs;
	}
	/**
	 * Converts the tab separated annotation tokens of a JRonn record into
	 * numbers.
	 * 
	 * @param annotValues
	 *            the textual disorder values
	 * @return the parsed values, in the same order as the input
	 * @throws UnknownFileFormatException
	 *             if any of the values is not a valid floating point number
	 */
	private static float[] convertToNumber(String[] annotValues)
			throws UnknownFileFormatException {
		float[] annotation = new float[annotValues.length];
		try {
			for (int i = 0; i < annotation.length; i++) {
				annotation[i] = Float.parseFloat(annotValues[i]);
			}
		} catch (NumberFormatException e) {
			// Chain the exception itself. e.getCause() is the cause of the
			// NumberFormatException, so passing it dropped the parse failure.
			throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE, e);
		}
		return annotation;
	}

	/**
	 * Description of the expected JRonn file layout. It is used as (part of)
	 * the message of the UnknownFileFormatException thrown by readJRonn and
	 * convertToNumber when the input cannot be parsed.
	 */
	private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"
			+ ">sequence_name\n "
			+ "M	V	S\n"
			+ "0.43	0.22	0.65\n"
			+ "Where first line is the sequence name,\n"
			+ "second line is the tab delimited sequence,\n"
			+ "third line contains tab delimited disorder prediction values.\n"
			+ "No lines are allowed between these three. Additionally, the number of  "
			+ "sequence residues must be equal to the number of the disorder values.";

	/**
	 * Closes the Closeable and logs the exception, if any, instead of
	 * propagating it to the caller.
	 * 
	 * @param log
	 *            the logger that receives a WARNING record if closing fails;
	 *            may be null, in which case the failure is dropped
	 * @param stream
	 *            the resource to close; may be null, in which case nothing
	 *            happens
	 */
	public final static void closeSilently(java.util.logging.Logger log,
			Closeable stream) {
		if (stream == null) {
			return;
		}
		try {
			stream.close();
		} catch (IOException e) {
			if (log != null) {
				// Log the exception itself. e.getCause() is usually null, so
				// passing it lost the exception type and stack trace.
				log.log(Level.WARNING, e.getLocalizedMessage(), e);
			}
		}
	}

	/**
	 * 
	 * TODO complete!
	 * 
	 * RESIDUE COILS REM465 HOTLOOPS
	 * 
	 * M 0.86010 0.88512 0.37094
	 * 
	 * T 0.79983 0.85864 0.44331 ....
	 * 
	 * RESIDUE COILS REM465 HOTLOOPS
	 * 
	 * M 0.86010 0.88512 0.37094
	 * 
	 * 
	 * @param input
	 * @return
	 * @throws IOException
	 * @throws UnknownFileFormatException
	 */
	static Map> readDisembl(final InputStream input)
			throws IOException, UnknownFileFormatException {
		Scanner scan = new Scanner(input);
		scan.useDelimiter("# RESIDUE COILS REM465 HOTLOOPS\n");
		if (!scan.hasNext()) {
			throw new UnknownFileFormatException(
					"In Disembl score format each seqeunce score is expected to start from the line: "
							+ "'# RESIDUE COILS REM465 HOTLOOPS\\n'."
							+ " No such line was found!");
		}

		Map> results = new HashMap>();
		int seqCounter = 0;
		while (scan.hasNext()) {
			seqCounter++;
			String singleSeq = scan.next();
			Scanner scansingle = new Scanner(singleSeq);
			StringBuffer seqbuffer = new StringBuffer();
			ArrayList coils = new ArrayList();
			ArrayList rem = new ArrayList();
			ArrayList hotloops = new ArrayList();
			FastaSequence fs = new FastaSequence(Integer.toString(seqCounter),
					singleSeq);
			while (scansingle.hasNextLine()) {
				String valueLine = scansingle.nextLine();
				Scanner values = new Scanner(valueLine);
				seqbuffer.append(values.next());
				coils.add(values.nextFloat());
				rem.add(values.nextFloat());
				hotloops.add(values.nextFloat());
				values.close();
			}
			Set scores = new HashSet();
			scores.add(new Score(DisemblResultAnnot.COILS, coils));
			scores.add(new Score(DisemblResultAnnot.HOTLOOPS, hotloops));
			scores.add(new Score(DisemblResultAnnot.REM465, rem));
			results.put(fs, scores);

			scansingle.close();
		}

		input.close();
		return results;
	}
	/**
	 * Read AACon result with no alignment files. This method leaves the
	 * incoming InputStream results open!
	 * 
	 * The input is split into records on '#'. Each record consists of the
	 * conservation method name, a space, and the whitespace separated values
	 * of this method. Values are read up to the first token that is not a
	 * number.
	 * 
	 * @param results
	 *            output file of AAConservation
	 * @return the set of {@link Score} objects, one per record, each built
	 *         from the {@link ConservationMethod} of the record and its values
	 * @throws NullPointerException
	 *             if results is null
	 */
	public static HashSet<Score> readAAConResults(InputStream results) {
		if (results == null) {
			throw new NullPointerException(
					"InputStream with results must be provided");
		}
		final HashSet<Score> annotations = new HashSet<Score>();
		// deliberately not closed: closing the Scanner would close results
		final Scanner sc = new Scanner(results);
		sc.useDelimiter("#");
		while (sc.hasNext()) {
			final String line = sc.next();
			final int spacePos = line.indexOf(" ");
			assert spacePos > 0 : "Space is expected as delimited between method "
					+ "name and values!";
			final String methodLine = line.substring(0, spacePos);
			final ConservationMethod method = ConservationMethod
					.getMethod(methodLine);
			assert method != null : "Method " + methodLine
					+ " is not recognized! ";
			final Scanner valuesScanner = new Scanner(line.substring(spacePos));
			// Values use '.' as the decimal separator. With the default
			// locale of the JVM (e.g. a comma-decimal one) hasNextDouble()
			// would reject them and the score would silently come out empty.
			valuesScanner.useLocale(java.util.Locale.US);
			final ArrayList<Float> values = new ArrayList<Float>();
			while (valuesScanner.hasNextDouble()) {
				values.add((float) valuesScanner.nextDouble());
			}
			valuesScanner.close();
			annotations.add(new Score(method, values));
		}
		return annotations;
	}

	/**
	 * Reads and parses Fasta or Clustal formatted file into a list of
	 * FastaSequence objects. The file is opened twice: once to detect the
	 * format and once to parse it. Both streams are closed before the method
	 * returns, whether it succeeds or fails.
	 * 
	 * @param inFilePath
	 *            the path to the input file
	 * @throws IOException
	 *             if the file denoted by inFilePath cannot be read
	 * @throws UnknownFileFormatException
	 *             if the inFilePath points to the file which format cannot be
	 *             recognised
	 * @return the List of FastaSequence objects
	 * 
	 */
	public static List<FastaSequence> openInputStream(String inFilePath)
			throws IOException, UnknownFileFormatException {

		final boolean clustal;
		final InputStream inStrForValidation = new FileInputStream(inFilePath);
		try {
			clustal = ClustalAlignmentUtil
					.isValidClustalFile(inStrForValidation);
		} finally {
			// closing an already closed stream has no effect, so this is safe
			// even if the validation closed the stream itself
			inStrForValidation.close();
		}

		// opened only after validation so that a failure above cannot leak it
		final InputStream inStr = new FileInputStream(inFilePath);
		try {
			if (clustal) {
				final Alignment al = ClustalAlignmentUtil
						.readClustalFile(inStr);
				// alignment cannot be null see
				// ClustalAlignmentUtil.readClustalFile(inStr);
				return al.getSequences();
			}
			return SequenceUtil.readFasta(inStr);
		} finally {
			inStr.close();
		}
	}

}