X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=datamodel%2Fcompbio%2Fdata%2Fsequence%2FSequenceUtil.java;h=14dedf4214b4d8dfe4f5e0264b0a31b59ae949d2;hb=94faa6031ae916e4f7f7e61494e36278adcab7e0;hp=a636c3a0a80b7190587d8821b27a2bc7f16694bd;hpb=4d6083e0cf39c4989e08bdaf9384e9d44c0da607;p=jabaws.git

diff --git a/datamodel/compbio/data/sequence/SequenceUtil.java b/datamodel/compbio/data/sequence/SequenceUtil.java
index a636c3a..14dedf4 100644
--- a/datamodel/compbio/data/sequence/SequenceUtil.java
+++ b/datamodel/compbio/data/sequence/SequenceUtil.java
@@ -1,15 +1,19 @@
-/*
- * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin
- * Jalview Web Services version: 2.0 This library is free software; you can
- * redistribute it and/or modify it under the terms of the Apache License
- * version 2 as published by the Apache Software Foundation This library is
- * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the
- * license is in apache_license.txt. It is also available here: see:
- * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived
- * work distributed in source code form must include this copyright and license
- * notice.
+/* Copyright (c) 2011 Peter Troshin
+ *  
+ *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0     
+ * 
+ *  This library is free software; you can redistribute it and/or modify it under the terms of the
+ *  Apache License version 2 as published by the Apache Software Foundation
+ * 
+ *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
+ *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache 
+ *  License for more details.
+ * 
+ *  A copy of the license is in apache_license.txt. It is also available here:
+ * @see: http://www.apache.org/licenses/LICENSE-2.0.txt
+ * 
+ * Any republication or derived work distributed in source code form
+ * must include this copyright and license notice.
  */
 
 package compbio.data.sequence;
@@ -31,15 +35,19 @@ import java.util.List;
 import java.util.Map;
 import java.util.Scanner;
 import java.util.Set;
+import java.util.TreeSet;
 import java.util.logging.Level;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import compbio.util.Util;
+
 /**
  * Utility class for operations on sequences
  * 
- * @author Petr Troshin
- * @version 1.0
+ * @author Peter Troshin
+ * @since 1.0
+ * @version 2.0 June 2011
  */
 public final class SequenceUtil {
 
@@ -96,19 +104,6 @@ public final class SequenceUtil {
 	private SequenceUtil() {
 	} // utility class, no instantiation
 
-	/*
-	 * public static void write_PirSeq(OutputStream os, FastaSequence seq)
-	 * throws IOException { BufferedWriter pir_out = new BufferedWriter(new
-	 * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +
-	 * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +
-	 * SysPrefs.newlinechar); pir_out.close(); } public static void
-	 * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {
-	 * BufferedWriter fasta_out = new BufferedWriter( new
-	 * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +
-	 * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +
-	 * SysPrefs.newlinechar); fasta_out.close(); }
-	 */
-
 	/**
 	 * @return true is the sequence contains only letters a,c, t, g, u
 	 */
@@ -167,6 +162,17 @@ public final class SequenceUtil {
 	}
 
 	/**
+	 * Remove all non AA chars from the sequence
+	 * 
+	 * @param sequence
+	 *            the sequence to clean
+	 * @return cleaned sequence
+	 */
+	public static String cleanProteinSequence(String sequence) {
+		return SequenceUtil.NON_AA.matcher(sequence).replaceAll("");
+	}
+
+	/**
 	 * @param sequence
 	 * @return true is the sequence is a protein sequence, false overwise
 	 */
@@ -254,28 +260,11 @@ public final class SequenceUtil {
 	public static List<FastaSequence> readFasta(final InputStream inStream)
 			throws IOException {
 		final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
-
-		final BufferedReader infasta = new BufferedReader(
-				new InputStreamReader(inStream, "UTF8"), 16000);
-		final Pattern pattern = Pattern.compile("//s+");
-
-		String line;
-		String sname = "", seqstr = null;
-		do {
-			line = infasta.readLine();
-			if ((line == null) || line.startsWith(">")) {
-				if (seqstr != null) {
-					seqs.add(new FastaSequence(sname.substring(1), seqstr));
-				}
-				sname = line; // remove >
-				seqstr = "";
-			} else {
-				final String subseq = pattern.matcher(line).replaceAll("");
-				seqstr += subseq;
-			}
-		} while (line != null);
-
-		infasta.close();
+		FastaReader reader = new FastaReader(inStream);
+		while (reader.hasNext()) {
+			seqs.add(reader.next());
+		}
+		inStream.close();
 		return seqs;
 	}
 
@@ -297,6 +286,127 @@ public final class SequenceUtil {
 		outWriter.close();
 	}
 
+	/**
+	 * Read IUPred output
+	 * 
+	 * @param result
+	 * @return Map key->sequence name, value->Score
+	 * @throws IOException
+	 * @throws UnknownFileFormatException
+	 */
+	public static Map<String, Score> readIUPred(final File result)
+			throws IOException, UnknownFileFormatException {
+		InputStream input = new FileInputStream(result);
+		Map<String, Score> sequences = readIUPred(input,
+				IUPredResult.getType(result));
+		input.close();
+		return sequences;
+	}
+
+	// The result type (short, long or glob) decides how each entry is parsed:
+	// glob entries hold domain ranges, short and long entries hold scores.
+	/**
+	 * ## Long Disorder
+	 * 
+	 * # P53_HUMAN
+	 * 
+	 * 1 M 0.9943
+	 * 
+	 * 2 E 0.9917
+	 * 
+	 * 3 E 0.9879
+	 * 
+	 * (every line)
+	 * 
+	 * @throws IOException
+	 * @throws UnknownFileFormatException
+	 * 
+	 * 
+	 */
+	private static Map<String, Score> readIUPred(InputStream input,
+			IUPredResult type) throws IOException, UnknownFileFormatException {
+
+		Score score = null;
+		final Map<String, Score> seqs = new HashMap<String, Score>();
+		Scanner scan = new Scanner(input);
+		scan.useDelimiter("#");
+		while (scan.hasNext()) {
+			String nextEntry = scan.next();
+			Scanner entry = new Scanner(nextEntry);
+			String name = entry.nextLine().trim();
+			// inside entry:
+			if (IUPredResult.Glob == type) {
+				// parse domains
+				TreeSet<Range> ranges = parseIUPredDomains(entry);
+				score = new Score(type, ranges);
+			} else {
+				// parse short | long
+				float[] scores = parseIUPredScores(entry);
+				score = new Score(type, scores);
+			}
+			entry.close();
+			seqs.put(name, score);
+		}
+
+		scan.close();
+		return seqs;
+	}
+
+	/**
+	 * # P53_HUMA
+	 * 
+	 * Number of globular domains: 2
+	 * 
+	 * globular domain 1. 98 - 269
+	 * 
+	 * globular domain 2. 431 - 482
+	 * 
+	 * >P53_HUMA
+	 * 
+	 * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp
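+	 * @param scan scanner positioned at the "Number of globular domains:" line
+	 * @return the parsed domain ranges; empty when the domain count is 0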
+	 */
+	private static TreeSet<Range> parseIUPredDomains(Scanner scan) {
+		String header = "Number of globular domains:";
+		String domainPref = "globular domain";
+		TreeSet<Range> ranges = new TreeSet<Range>();
+		String line = scan.nextLine().trim();
+		assert line.startsWith(header);
+		line = line.substring(header.length()).trim();
+		int domainNum = Integer.parseInt(line);
+		if (domainNum == 0) {
+			return ranges;
+		}
+
+		for (int i = 0; i < domainNum; i++) {
+			assert scan.hasNextLine();
+			line = scan.nextLine();
+			assert line.trim().startsWith(domainPref);
+			line = line.substring(line.indexOf(".") + 1).trim();
+			Range r = new Range(line.split("-"));
+			ranges.add(r);
+		}
+
+		return ranges;
+	}
+	/*
+	 * 1 M 0.9943
+	 * 
+	 * 2 E 0.9917
+	 */
+	private static float[] parseIUPredScores(Scanner scan)
+			throws UnknownFileFormatException {
+		List<String> annotation = new ArrayList<String>();
+		while (scan.hasNextLine()) {
+			String line = scan.nextLine().trim();
+			String[] val = line.split("\\s+");
+			annotation.add(val[2]);
+		}
+		return convertToNumber(annotation
+				.toArray(new String[annotation.size()]));
+	}
+
 	public static Map<String, Score> readJRonn(final File result)
 			throws IOException, UnknownFileFormatException {
 		InputStream input = new FileInputStream(result);
@@ -317,7 +427,7 @@ public final class SequenceUtil {
 	 * 
 	 * @param inStream
 	 *            the InputStream connected to the JRonn output file
-	 * @return List of {@link AnnotatedSequence} objects
+	 * @return Map key=sequence name value=Score
 	 * @throws IOException
 	 *             is thrown if the inStream has problems accessing the data
 	 * @throws UnknownFileFormatException
@@ -361,6 +471,7 @@ public final class SequenceUtil {
 		infasta.close();
 		return seqs;
 	}
+
 	private static float[] convertToNumber(String[] annotValues)
 			throws UnknownFileFormatException {
 		float[] annotation = new float[annotValues.length];
@@ -404,88 +515,215 @@ public final class SequenceUtil {
 
 	/**
 	 * 
-	 * TODO complete!
+	 * > Foobar_dundeefriends
 	 * 
-	 * >Sequence name
+	 * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343
 	 * 
-	 * RESIDUE COILS REM465 HOTLOOPS
+	 * # REM465 355-368
 	 * 
-	 * M 0.86010 0.88512 0.37094
+	 * # HOTLOOPS 190-204
 	 * 
-	 * T 0.79983 0.85864 0.44331 ....
-
-	 * >Next Sequence name 
-	 * RESIDUE COILS REM465 HOTLOOPS
+	 * # RESIDUE COILS REM465 HOTLOOPS
 	 * 
 	 * M 0.86010 0.88512 0.37094
 	 * 
+	 * T 0.79983 0.85864 0.44331
+	 * 
+	 * >Next Sequence name
+	 * 
 	 * 
 	 * @param input
-	 * @return
+	 *            the InputStream
+	 * @return Map key=sequence name, value=set of scores
 	 * @throws IOException
 	 * @throws UnknownFileFormatException
 	 */
-	public static Map<FastaSequence, Set<Score>> readDisembl(final InputStream input)
-			throws IOException, UnknownFileFormatException {
+	public static HashMap<String, Set<Score>> readDisembl(
+			final InputStream input) throws IOException,
+			UnknownFileFormatException {
 		Scanner scan = new Scanner(input);
 		scan.useDelimiter(">");
 		if (!scan.hasNext()) {
 			throw new UnknownFileFormatException(
-					"In Disembl score format each sequence score is expected " +
-					"to start from the line: >Sequence name "
+					"In Disembl score format each sequence score is expected "
+							+ "to start from the line: >Sequence name "
 							+ " No such line was found!");
 		}
 
-		Map<FastaSequence, Set<Score>> results = new HashMap<FastaSequence, Set<Score>>();
+		HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();
 		int seqCounter = 0;
 		while (scan.hasNext()) {
 			seqCounter++;
 			String singleSeq = scan.next();
- 			Scanner scansingle = new Scanner(singleSeq);
-			if(!scansingle.hasNextLine()) {
- 				throw new RuntimeException("The input looks like an incomplete disembl file - cannot parse!");
- 			}
- 			
+			Scanner scansingle = new Scanner(singleSeq);
+			if (!scansingle.hasNextLine()) {
+				throw new RuntimeException(
+						"The input looks like an incomplete disembl file - cannot parse!");
+			}
+
 			StringBuffer seqbuffer = new StringBuffer();
 			ArrayList<Float> coils = new ArrayList<Float>();
 			ArrayList<Float> rem = new ArrayList<Float>();
 			ArrayList<Float> hotloops = new ArrayList<Float>();
 
 			String sequenceName = scansingle.nextLine().trim();
-			String title =  scansingle.nextLine();
+			TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,
+					scansingle.nextLine());
+			TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,
+					scansingle.nextLine());
+			TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,
+					scansingle.nextLine());
+
+			String title = scansingle.nextLine();
 			assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";
- 			
+
 			while (scansingle.hasNext()) {
 				seqbuffer.append(scansingle.next());
 				coils.add(scansingle.nextFloat());
 				rem.add(scansingle.nextFloat());
 				hotloops.add(scansingle.nextFloat());
 			}
-			FastaSequence fs = new FastaSequence(sequenceName,seqbuffer.toString());
-			Set<Score> scores = new HashSet<Score>();
-			scores.add(new Score(DisemblResultAnnot.COILS, coils));
-			scores.add(new Score(DisemblResultAnnot.HOTLOOPS, hotloops));
-			scores.add(new Score(DisemblResultAnnot.REM465, rem));
-			results.put(fs, scores);
+			/*
+			 * Also possible FastaSequence fs = new FastaSequence(sequenceName,
+			 * seqbuffer.toString());
+			 */
+			HashSet<Score> scores = new HashSet<Score>();
+			scores.add(new Score(DisemblResult.COILS, coils, coilsR));
+			scores.add(new Score(DisemblResult.REM465, rem, rem465R));
+			scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, loopsR));
+			results.put(sequenceName, scores);
 
 			scansingle.close();
 		}
-
+		scan.close();
 		input.close();
 		return results;
 	}
-	
-	public static  Map<String, Set<Score>> removeSequences(Map<FastaSequence, Set<Score>> disemblResults) { 
-		Map<String, Set<Score>> seqNameScores = new HashMap<String, Set<Score>>();
-		for(Map.Entry<FastaSequence,Set<Score>> dres: disemblResults.entrySet()) {
-			seqNameScores.put(dres.getKey().getId(),dres.getValue()); 
+
+	/**
+	 * Parsing:
+	 * 
+	 * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,
+	 * 350-391, 429-485, 497-506, 539-547
+	 * 
+	 * # REM465 355-368
+	 * 
+	 * # HOTLOOPS 190-204
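+	 * @param resultType enum whose name is expected after '#' (assert only)
+	 * @param lines the text to parse, e.g. one "# COILS 34-41, 50-58" line
+	 * @return the parsed ranges as a TreeSet; empty if no ranges are listed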
+	 */
+	private static TreeSet<Range> parseRanges(Enum resultType, String lines) {
+		TreeSet<Range> ranges = new TreeSet<Range>();
+
+		Scanner scan = new Scanner(lines);
+
+		assert scan.hasNext();
+		String del = scan.next();
+		assert "#".equals(del); // pass delimiter #
+		String type = scan.next(); // pass enum name e.g. COILS
+		assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "
+				+ resultType.toString();
+
+		// beginning of the ranges
+		scan.useDelimiter(",");
+		while (scan.hasNext()) {
+			String range = scan.next();
+			if (!Util.isEmpty(range)) {
+				ranges.add(new Range(range.split("-")));
+			}
+		}
+		return ranges;
+	}
+
+	/**
+	 * Reads GlobPlot output. Each record is expected to look like this:
+	 * 
+	 * > Foobar_dundeefriends
+	 * 
+	 * # GlobDoms start-end, start-end, ...
+	 * 
+	 * # Disorder start-end, start-end, ...
+	 * 
+	 * # RESIDUE DYDX RAW SMOOTHED
+	 * 
+	 * residue dydx raw smoothed (one such line per residue)
+	 * 
+	 * >Next Sequence name
+	 * 
+	 * Unlike Disembl output there are no COILS, REM465 or HOTLOOPS lines;
+	 * the two range lines are followed by three score columns.
+	 * 
+	 * @param input
+	 *            the InputStream
+	 * @return Map key=sequence name, value=set of scores
+	 * @throws IOException
+	 * @throws UnknownFileFormatException
+	 */
+	public static HashMap<String, Set<Score>> readGlobPlot(
+			final InputStream input) throws IOException,
+			UnknownFileFormatException {
+		Scanner scan = new Scanner(input);
+		scan.useDelimiter(">");
+		if (!scan.hasNext()) {
+			throw new UnknownFileFormatException(
+					"In GlobPlot score format each sequence score is expected "
+							+ "to start from the line: >Sequence name "
+							+ " No such line was found!");
+		}
+
+		HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();
+		int seqCounter = 0;
+		while (scan.hasNext()) {
+			seqCounter++;
+			String singleSeq = scan.next();
+			Scanner scansingle = new Scanner(singleSeq);
+			if (!scansingle.hasNextLine()) {
+				throw new RuntimeException(
+						"The input looks like an incomplete GlobPlot file - cannot parse!");
+			}
+
+			StringBuffer seqbuffer = new StringBuffer();
+			ArrayList<Float> dydxScore = new ArrayList<Float>();
+			ArrayList<Float> rawScore = new ArrayList<Float>();
+			ArrayList<Float> smoothedScore = new ArrayList<Float>();
+
+			String sequenceName = scansingle.nextLine().trim();
+			TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,
+					scansingle.nextLine());
+			TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,
+					scansingle.nextLine());
+
+			String title = scansingle.nextLine();
+			assert title.startsWith("# RESIDUE	DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";
+
+			while (scansingle.hasNext()) {
+				seqbuffer.append(scansingle.next());
+				dydxScore.add(scansingle.nextFloat());
+				rawScore.add(scansingle.nextFloat());
+				smoothedScore.add(scansingle.nextFloat());
+			}
+			/*
+			 * Also possible FastaSequence fs = new FastaSequence(sequenceName,
+			 * seqbuffer.toString());
+			 */
+			Set<Score> scores = new TreeSet<Score>();
+			scores.add(new Score(GlobProtResult.Disorder, disorderR));
+			scores.add(new Score(GlobProtResult.GlobDoms, domsR));
+			scores.add(new Score(GlobProtResult.Dydx, dydxScore));
+			scores.add(new Score(GlobProtResult.RawScore, rawScore));
+			scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));
+			results.put(sequenceName, scores);
+
+			scansingle.close();
 		}
-		return seqNameScores;
+		scan.close();
+		input.close();
+		return results;
 	}
-	
 	/**
 	 * Read AACon result with no alignment files. This method leaves incoming
-	 * the InputStream results open!
+	 * InputStream open!
 	 * 
 	 * @param results
 	 *            output file of AAConservation
@@ -554,3 +792,53 @@ public final class SequenceUtil {
 	}
 
 }
+
+enum DisemblResult {
+	/** These contain ranges and scores */
+	COILS, REM465, HOTLOOPS
+}
+enum GlobProtResult {
+	/** This is a range with no scores */
+	GlobDoms,
+	/** This is a range with no scores */
+	Disorder,
+	/** This is a score with no range */
+	Dydx,
+	/** This is a score with no range */
+	SmoothedScore,
+	/** This is a score with no range */
+	RawScore
+}
+
+enum IUPredResult {
+	/**
+	 * Short disorder
+	 */
+	Short,
+	/**
+	 * Long disorder
+	 */
+	Long,
+	/**
+	 * Globular domains
+	 */
+	Glob;
+
+	static IUPredResult getType(File file) {
+		assert file != null;
+		String name = file.getName();
+		if (name.endsWith(Long.toString().toLowerCase())) {
+			return Long;
+		}
+		if (name.endsWith(Short.toString().toLowerCase())) {
+			return Short;
+		}
+		if (name.endsWith(Glob.toString().toLowerCase())) {
+			return Glob;
+		}
+		throw new AssertionError(
+				"IUPred result file type cannot be recognised! "
+						+ "\nFile must ends with one of [glob, long or short]"
+						+ "\n but given file name was: " + file.getName());
+	}
+}
\ No newline at end of file