X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=datamodel%2Fcompbio%2Fdata%2Fsequence%2FSequenceUtil.java;h=6e20988d0b4c1bdd2ab4751e672402ac2779e7d9;hb=91dc99b9b18e403c97b1c4e0ead8f754991714a5;hp=c9955853b8eae4cb5f0e79e1ccffb73f53845542;hpb=212bbd43c19f645cccef34a608dc001fb694833c;p=jabaws.git diff --git a/datamodel/compbio/data/sequence/SequenceUtil.java b/datamodel/compbio/data/sequence/SequenceUtil.java index c995585..6e20988 100644 --- a/datamodel/compbio/data/sequence/SequenceUtil.java +++ b/datamodel/compbio/data/sequence/SequenceUtil.java @@ -1,15 +1,19 @@ -/* - * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin - * Jalview Web Services version: 2.0 This library is free software; you can - * redistribute it and/or modify it under the terms of the Apache License - * version 2 as published by the Apache Software Foundation This library is - * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A - * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the - * license is in apache_license.txt. It is also available here: see: - * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived - * work distributed in source code form must include this copyright and license - * notice. +/* Copyright (c) 2011 Peter Troshin + * + * JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0 + * + * This library is free software; you can redistribute it and/or modify it under the terms of the + * Apache License version 2 as published by the Apache Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without + * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache + * License for more details. + * + * A copy of the license is in apache_license.txt. 
It is also available here: + * @see: http://www.apache.org/licenses/LICENSE-2.0.txt + * + * Any republication or derived work distributed in source code form + * must include this copyright and license notice. */ package compbio.data.sequence; @@ -25,20 +29,25 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Scanner; +import java.util.Set; +import java.util.TreeSet; import java.util.logging.Level; import java.util.regex.Matcher; import java.util.regex.Pattern; -import compbio.conservation.Method; +import compbio.util.Util; /** * Utility class for operations on sequences * - * @author Petr Troshin - * @version 1.0 + * @author Peter Troshin + * @since 1.0 + * @version 2.0 June 2011 */ public final class SequenceUtil { @@ -95,19 +104,6 @@ public final class SequenceUtil { private SequenceUtil() { } // utility class, no instantiation - /* - * public static void write_PirSeq(OutputStream os, FastaSequence seq) - * throws IOException { BufferedWriter pir_out = new BufferedWriter(new - * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() + - * SysPrefs.newlinechar); pir_out.write(seq.getSequence() + - * SysPrefs.newlinechar); pir_out.close(); } public static void - * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException { - * BufferedWriter fasta_out = new BufferedWriter( new - * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() + - * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() + - * SysPrefs.newlinechar); fasta_out.close(); } - */ - /** * @return true is the sequence contains only letters a,c, t, g, u */ @@ -166,6 +162,17 @@ public final class SequenceUtil { } /** + * Remove all non AA chars from the sequence + * + * @param sequence + * the sequence to clean + * @return cleaned sequence + */ + public static String 
cleanProteinSequence(String sequence) { + return SequenceUtil.NON_AA.matcher(sequence).replaceAll(""); + } + + /** * @param sequence * @return true is the sequence is a protein sequence, false overwise */ @@ -253,28 +260,11 @@ public final class SequenceUtil { public static List readFasta(final InputStream inStream) throws IOException { final List seqs = new ArrayList(); - - final BufferedReader infasta = new BufferedReader( - new InputStreamReader(inStream, "UTF8"), 16000); - final Pattern pattern = Pattern.compile("//s+"); - - String line; - String sname = "", seqstr = null; - do { - line = infasta.readLine(); - if ((line == null) || line.startsWith(">")) { - if (seqstr != null) { - seqs.add(new FastaSequence(sname.substring(1), seqstr)); - } - sname = line; // remove > - seqstr = ""; - } else { - final String subseq = pattern.matcher(line).replaceAll(""); - seqstr += subseq; - } - } while (line != null); - - infasta.close(); + FastaReader reader = new FastaReader(inStream); + while (reader.hasNext()) { + seqs.add(reader.next()); + } + inStream.close(); return seqs; } @@ -296,26 +286,157 @@ public final class SequenceUtil { outWriter.close(); } - public static List readJRonn(final File result) + /** + * Read IUPred output + * + * @param result + * @return Map key->sequence name, value->Score + * @throws IOException + * @throws UnknownFileFormatException + */ + public static Map readIUPred(final File result) throws IOException, UnknownFileFormatException { InputStream input = new FileInputStream(result); - List sequences = readJRonn(input); + Map sequences = readIUPred(input, + IUPredResult.getType(result)); input.close(); return sequences; } + // Check the type of the file e.g. 
long| short or domain + // and read /** - * Reader for JRonn horizontal file format >Foobar M G D T T A G 0.48 0.42 - * 0.42 0.48 0.52 0.53 0.54 All values are tab delimited + * ## Long Disorder + * + * # P53_HUMAN + * + * 1 M 0.9943 + * + * 2 E 0.9917 + * + * 3 E 0.9879 + * + * (every line) + * + * @throws IOException + * @throws UnknownFileFormatException + * + * + */ + private static Map readIUPred(InputStream input, + IUPredResult type) throws IOException, UnknownFileFormatException { + + Score score = null; + final Map seqs = new HashMap(); + Scanner scan = new Scanner(input); + scan.useDelimiter("#"); + while (scan.hasNext()) { + String nextEntry = scan.next(); + Scanner entry = new Scanner(nextEntry); + String name = entry.nextLine().trim(); + // inside entry: + if (IUPredResult.Glob == type) { + // parse domains + TreeSet ranges = parseIUPredDomains(entry); + score = new Score(type, ranges); + } else { + // parse short | long + float[] scores = parseIUPredScores(entry); + score = new Score(type, scores); + } + entry.close(); + seqs.put(name, score); + } + + scan.close(); + return seqs; + } + + /** + * # P53_HUMA + * + * Number of globular domains: 2 + * + * globular domain 1. 98 - 269 + * + * globular domain 2. 
431 - 482 + * + * >P53_HUMA + * + * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp + * + * @param scan + */ + private static TreeSet parseIUPredDomains(Scanner scan) { + String header = "Number of globular domains:"; + String domainPref = "globular domain"; + TreeSet ranges = new TreeSet(); + String line = scan.nextLine().trim(); + assert line.startsWith(header); + line = line.substring(header.length()).trim(); + int domainNum = Integer.parseInt(line); + if (domainNum == 0) { + return ranges; + } + + for (int i = 0; i < domainNum; i++) { + assert scan.hasNextLine(); + line = scan.nextLine(); + assert line.trim().startsWith(domainPref); + line = line.substring(line.indexOf(".") + 1).trim(); + Range r = new Range(line.split("-")); + ranges.add(r); + } + + return ranges; + } + /* + * 1 M 0.9943 + * + * 2 E 0.9917 + */ + private static float[] parseIUPredScores(Scanner scan) + throws UnknownFileFormatException { + List annotation = new ArrayList(); + while (scan.hasNextLine()) { + String line = scan.nextLine().trim(); + String[] val = line.split("\\s+"); + annotation.add(val[2]); + } + return convertToNumber(annotation + .toArray(new String[annotation.size()])); + } + + public static Map readJRonn(final File result) + throws IOException, UnknownFileFormatException { + InputStream input = new FileInputStream(result); + Map sequences = readJRonn(input); + input.close(); + return sequences; + } + + /** + * Reader for JRonn horizontal file format + * + *
+	 * >Foobar M G D T T A G 0.48 0.42
+	 * 0.42 0.48 0.52 0.53 0.54
+	 * 
+	 * 
+	 * Where all values are tab delimited
 	 * 
 	 * @param inStream
-	 * @return
+	 *            the InputStream connected to the JRonn output file
+	 * @return Map key=sequence name value=Score
 	 * @throws IOException
+	 *             is thrown if the inStream has problems accessing the data
 	 * @throws UnknownFileFormatException
+	 *             is thrown if the inStream represents an unknown source of
+	 * data, i.e. not a JRonn output
 	 */
-	public static List readJRonn(final InputStream inStream)
+	public static Map readJRonn(final InputStream inStream)
 			throws IOException, UnknownFileFormatException {
-		final List seqs = new ArrayList();
+		final Map seqs = new HashMap();
 
 		final BufferedReader infasta = new BufferedReader(
 				new InputStreamReader(inStream, "UTF8"), 16000);
@@ -343,7 +464,7 @@ public final class SequenceUtil {
 							"File does not look like Jronn horizontally formatted output file!\n"
 									+ JRONN_WRONG_FORMAT_MESSAGE);
 				}
-				seqs.add(new AnnotatedSequence(sname, sequence, annotation));
+				seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));
 			}
 		} while (line != null);
 
@@ -394,72 +515,219 @@ public final class SequenceUtil {
 
 	/**
 	 * 
-	 * TODO complete!
+	 * Parses Disembl output, which holds one record per sequence:
+	 * 
+	 * <pre>
+	 * > Foobar_dundeefriends
+	 * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343
+	 * # REM465 355-368
+	 * # HOTLOOPS 190-204
+	 * # RESIDUE COILS REM465 HOTLOOPS
+	 * M 0.86010 0.88512 0.37094
+	 * T 0.79983 0.85864 0.44331
+	 * >Next Sequence name
+	 * </pre>
+	 * 
+	 * Each range line is paired with the score column of the same name. The
+	 * input stream is closed once all records have been read.
 	 * 
-	 * # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512 0.37094 T 0.79983
-	 * 0.85864 0.44331 .... # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512
-	 * 0.37094
 	 * 
 	 * @param input
-	 * @return
+	 *            the InputStream connected to the Disembl output
+	 * @return Map key=sequence name, value=set of scores (one per result type)
 	 * @throws IOException
 	 * @throws UnknownFileFormatException
 	 */
-	public static List> readDisembl(
+	public static HashMap<String, Set<Score>> readDisembl(
 			final InputStream input) throws IOException,
 			UnknownFileFormatException {
 		Scanner scan = new Scanner(input);
-		scan.useDelimiter("# RESIDUE COILS REM465 HOTLOOPS\n");
+		scan.useDelimiter(">");
 		if (!scan.hasNext()) {
 			throw new UnknownFileFormatException(
-					"In Disembl score format each seqeunce score is expected to start from the line: "
-							+ "'# RESIDUE COILS REM465 HOTLOOPS\\n'."
+					"In Disembl score format each sequence score is expected "
+							+ "to start from the line: >Sequence name "
 							+ " No such line was found!");
 		}
 
-		List> results = new ArrayList>();
+		HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();
 		int seqCounter = 0;
 		while (scan.hasNext()) {
 			seqCounter++;
 			String singleSeq = scan.next();
 			Scanner scansingle = new Scanner(singleSeq);
+			if (!scansingle.hasNextLine()) {
+				throw new RuntimeException(
+						"The input looks like an incomplete disembl file - cannot parse!");
+			}
+
 			StringBuffer seqbuffer = new StringBuffer();
 			ArrayList coils = new ArrayList();
 			ArrayList rem = new ArrayList();
 			ArrayList hotloops = new ArrayList();
 
-			MultiAnnotatedSequence disemblRes = new MultiAnnotatedSequence(
-					DisemblResultAnnot.class);
-
-			while (scansingle.hasNextLine()) {
-				String valueLine = scansingle.nextLine();
-				Scanner values = new Scanner(valueLine);
-				seqbuffer.append(values.next());
-				coils.add(values.nextFloat());
-				rem.add(values.nextFloat());
-				hotloops.add(values.nextFloat());
-				values.close();
+			String sequenceName = scansingle.nextLine().trim();
+			TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,
+					scansingle.nextLine());
+			TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,
+					scansingle.nextLine());
+			TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,
+					scansingle.nextLine());
+
+			String title = scansingle.nextLine();
+			assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";
+
+			while (scansingle.hasNext()) {
+				seqbuffer.append(scansingle.next());
+				coils.add(scansingle.nextFloat());
+				rem.add(scansingle.nextFloat());
+				hotloops.add(scansingle.nextFloat());
 			}
-			disemblRes.addAnnotation(DisemblResultAnnot.COILS, coils);
-			disemblRes.addAnnotation(DisemblResultAnnot.REM465, rem);
-			disemblRes.addAnnotation(DisemblResultAnnot.HOTLOOPS, hotloops);
-			// TODO
-			// disemblRes.sequence = seqbuffer.toString();
+			/*
+			 * Pair every score column with the range line of the same name.
+			 * seqbuffer holds the residues should a FastaSequence be needed.
+			 */
+			HashSet<Score> scores = new HashSet<Score>();
+			scores.add(new Score(DisemblResult.COILS, coils, coilsR));
+			scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, loopsR));
+			scores.add(new Score(DisemblResult.REM465, rem, rem465R));
+			results.put(sequenceName, scores);
+
 			scansingle.close();
-			results.add(disemblRes);
 		}
-
+		scan.close();
 		input.close();
 		return results;
 	}
 
 	/**
+	 * Parses one range line of a Disembl or GlobPlot record into ranges. The
+	 * line starts with '#', followed by the result type name and a comma
+	 * separated list of start-end pairs, for example:
+	 * 
+	 * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343
+	 * 
+	 * @param resultType
+	 *            the result type the line is expected to describe
+	 * @param lines
+	 *            a single range line
+	 * @return the ranges listed on the line; empty if no range token is found
+	 */
+	private static TreeSet<Range> parseRanges(Enum<?> resultType, String lines) {
+		TreeSet<Range> ranges = new TreeSet<Range>();
+		Scanner scan = new Scanner(lines);
+
+		assert scan.hasNext();
+		String del = scan.next();
+		assert "#".equals(del); // pass delimiter #
+		String type = scan.next(); // pass enum name e.g. COILS
+		assert resultType.toString().equalsIgnoreCase(type) : "Expected result type "
+				+ resultType.toString() + " but found: " + type;
+
+		// everything after the type name is the comma separated range list
+		scan.useDelimiter(",");
+		while (scan.hasNext()) {
+			String range = scan.next();
+			if (!Util.isEmpty(range)) {
+				ranges.add(new Range(range.split("-")));
+			}
+		}
+		scan.close(); // the scanner is only needed while tokenising the line
+		return ranges;
+	}
+
+	/**
+	 * Parses GlobPlot output, which holds one record per sequence:
+	 * 
+	 * <pre>
+	 * >Sequence name
+	 * # GlobDoms (comma separated start-end ranges)
+	 * # Disorder (comma separated start-end ranges)
+	 * # RESIDUE DYDX RAW SMOOTHED
+	 * (one line per residue: the residue followed by the three scores)
+	 * >Next Sequence name
+	 * </pre>
+	 * 
+	 * The input stream is closed once all records have been read.
+	 * 
+	 * @param input
+	 *            the InputStream connected to the GlobPlot output
+	 * @return Map key=sequence name, value=set of scores: the GlobDoms and
+	 *         Disorder ranges plus the Dydx, RawScore and SmoothedScore values
+	 * @throws IOException
+	 *             if closing the input stream fails
+	 * @throws UnknownFileFormatException
+	 *             if the input is empty
+	 */
+	public static HashMap<String, Set<Score>> readGlobPlot(
+			final InputStream input) throws IOException,
+			UnknownFileFormatException {
+		Scanner scan = new Scanner(input);
+		scan.useDelimiter(">");
+		if (!scan.hasNext()) {
+			throw new UnknownFileFormatException(
+					"In GlobPlot score format each sequence score is expected "
+							+ "to start from the line: >Sequence name "
+							+ " No such line was found!");
+		}
+
+		// every '>'-delimited token holds the record of a single sequence
+		HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();
+		while (scan.hasNext()) {
+			String singleSeq = scan.next();
+			Scanner scansingle = new Scanner(singleSeq);
+			if (!scansingle.hasNextLine()) {
+				throw new RuntimeException(
+						"The input looks like an incomplete GlobPlot file - cannot parse!");
+			}
+
+			StringBuffer seqbuffer = new StringBuffer();
+			ArrayList<Float> dydxScore = new ArrayList<Float>();
+			ArrayList<Float> rawScore = new ArrayList<Float>();
+			ArrayList<Float> smoothedScore = new ArrayList<Float>();
+
+			// record layout: name line, two range lines, column title, scores
+			String sequenceName = scansingle.nextLine().trim();
+			TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,
+					scansingle.nextLine());
+			TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,
+					scansingle.nextLine());
+
+			String title = scansingle.nextLine();
+			assert title.startsWith("# RESIDUE\tDYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";
+
+			// each residue entry: the residue, then dydx, raw and smoothed
+			while (scansingle.hasNext()) {
+				seqbuffer.append(scansingle.next());
+				dydxScore.add(scansingle.nextFloat());
+				rawScore.add(scansingle.nextFloat());
+				smoothedScore.add(scansingle.nextFloat());
+			}
+			/*
+			 * Also possible FastaSequence fs = new FastaSequence(sequenceName,
+			 * seqbuffer.toString());
+			 */
+			Set<Score> scores = new TreeSet<Score>();
+			scores.add(new Score(GlobProtResult.Disorder, disorderR));
+			scores.add(new Score(GlobProtResult.GlobDoms, domsR));
+			scores.add(new Score(GlobProtResult.Dydx, dydxScore));
+			scores.add(new Score(GlobProtResult.RawScore, rawScore));
+			scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));
+			results.put(sequenceName, scores);
+
+			scansingle.close();
+		}
+		scan.close();
+		input.close();
+		return results;
+	}
+	/**
 	 * Read AACon result with no alignment files. This method leaves incoming
-	 * the InputStream results open!
+	 * InputStream open!
 	 * 
 	 * @param results
 	 *            output file of AAConservation
-	 * @return Map with keys {@link Method} -> float[]
+	 * @return Map with keys {@link ConservationMethod} -> float[]
 	 */
 	public static HashSet readAAConResults(InputStream results) {
 		if (results == null) {
@@ -475,7 +743,8 @@ public final class SequenceUtil {
 			assert spacePos > 0 : "Space is expected as delimited between method "
 					+ "name and values!";
 			String methodLine = line.substring(0, spacePos);
-			Method method = Method.getMethod(methodLine);
+			ConservationMethod method = ConservationMethod
+					.getMethod(methodLine);
 			assert method != null : "Method " + methodLine
 					+ " is not recognized! ";
 			Scanner valuesScanner = new Scanner(line.substring(spacePos));
@@ -489,4 +758,87 @@ public final class SequenceUtil {
 		return annotations;
 	}
 
+	/**
+	 * Reads and parses Fasta or Clustal formatted file into a list of
+	 * FastaSequence objects. Both streams opened on the file are closed
+	 * before the method returns, whether or not parsing succeeds.
+	 * 
+	 * @param inFilePath
+	 *            the path to the input file
+	 * @return the List of FastaSequence objects
+	 * @throws IOException
+	 *             if the file denoted by inFilePath cannot be read
+	 * @throws UnknownFileFormatException
+	 *             if the inFilePath points to the file which format cannot be
+	 *             recognised
+	 */
+	public static List<FastaSequence> openInputStream(String inFilePath)
+			throws IOException, UnknownFileFormatException {
+		InputStream inStrForValidation = new FileInputStream(inFilePath);
+		InputStream inStr = null;
+		try {
+			inStr = new FileInputStream(inFilePath);
+			if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {
+				return ClustalAlignmentUtil.readClustalFile(inStr).getSequences();
+			}
+			return SequenceUtil.readFasta(inStr);
+		} finally {
+			// closing a FileInputStream that is already closed is harmless
+			inStrForValidation.close();
+			if (inStr != null) {
+				inStr.close();
+			}
+		}
+	}
+
+}
+
+enum DisemblResult {
+	/** Each of these result types contains both ranges and scores */
+	COILS, REM465, HOTLOOPS
 }
+enum GlobProtResult {
+	/** This is a range with no scores */
+	GlobDoms,
+	/** This is a range with no scores */
+	Disorder,
+	/** This is a score with no range */
+	Dydx,
+	/** This is a score with no range */
+	SmoothedScore,
+	/** This is a score with no range */
+	RawScore
+}
+
+enum IUPredResult {
+	/** Short disorder */
+	Short,
+	/** Long disorder */
+	Long,
+	/** Globular domains */
+	Glob;
+
+	/**
+	 * Determines the IUPred result type from the ending of the file name.
+	 * 
+	 * @param file
+	 *            the IUPred output file
+	 * @return the type whose lower case name the file name ends with
+	 * @throws AssertionError
+	 *             if the file name ends with none of the type names
+	 */
+	static IUPredResult getType(File file) {
+		assert file != null;
+		final String fileName = file.getName();
+		// no name ends with two of short/long/glob, so try order is free
+		for (IUPredResult candidate : values()) {
+			if (fileName.endsWith(candidate.toString().toLowerCase())) {
+				return candidate;
+			}
+		}
+		throw new AssertionError(
+				"IUPred result file type cannot be recognised! "
+						+ "\nFile must ends with one of [glob, long or short]"
+						+ "\n but given file name was: " + fileName);
+	}
+}
\ No newline at end of file