-/*\r
- * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
- * Jalview Web Services version: 2.0 This library is free software; you can\r
- * redistribute it and/or modify it under the terms of the Apache License\r
- * version 2 as published by the Apache Software Foundation This library is\r
- * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
- * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
- * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
- * license is in apache_license.txt. It is also available here: see:\r
- * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
- * work distributed in source code form must include this copyright and license\r
- * notice.\r
+/* Copyright (c) 2011 Peter Troshin\r
+ * \r
+ * JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0 \r
+ * \r
+ * This library is free software; you can redistribute it and/or modify it under the terms of the\r
+ * Apache License version 2 as published by the Apache Software Foundation\r
+ * \r
+ * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
+ * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
+ * License for more details.\r
+ * \r
+ * A copy of the license is in apache_license.txt. It is also available here:\r
+ * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
+ * \r
+ * Any republication or derived work distributed in source code form\r
+ * must include this copyright and license notice.\r
*/\r
\r
package compbio.data.sequence;\r
import java.util.Map;\r
import java.util.Scanner;\r
import java.util.Set;\r
+import java.util.TreeSet;\r
import java.util.logging.Level;\r
import java.util.regex.Matcher;\r
import java.util.regex.Pattern;\r
\r
+import compbio.util.Util;\r
+\r
/**\r
* Utility class for operations on sequences\r
* \r
- * @author Petr Troshin\r
- * @version 1.0\r
+ * @author Peter Troshin\r
+ * @since 1.0\r
+ * @version 2.0 June 2011\r
*/\r
public final class SequenceUtil {\r
\r
private SequenceUtil() {\r
} // static utility class: the private constructor prevents instantiation\r
\r
- /*\r
- * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
- * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
- * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
- * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
- * SysPrefs.newlinechar); pir_out.close(); } public static void\r
- * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
- * BufferedWriter fasta_out = new BufferedWriter( new\r
- * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
- * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
- * SysPrefs.newlinechar); fasta_out.close(); }\r
- */\r
-\r
/**\r
* @return true if the sequence contains only the letters a, c, t, g, u\r
*/\r
}\r
\r
/**\r
+     * Strips from the sequence every character matched by the\r
+     * {@code NON_AA} pattern, i.e. all non amino acid characters.\r
+     * \r
+     * @param sequence\r
+     *            the sequence to clean\r
+     * @return the sequence with every {@code NON_AA} match removed\r
+     */\r
+    public static String cleanProteinSequence(String sequence) {\r
+        final Matcher nonAminoAcids = SequenceUtil.NON_AA.matcher(sequence);\r
+        return nonAminoAcids.replaceAll("");\r
+    }\r
+\r
+ /**\r
* @param sequence\r
* @return true if the sequence is a protein sequence, false otherwise\r
*/\r
public static List<FastaSequence> readFasta(final InputStream inStream)\r
throws IOException {\r
final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
-\r
- final BufferedReader infasta = new BufferedReader(\r
- new InputStreamReader(inStream, "UTF8"), 16000);\r
- final Pattern pattern = Pattern.compile("//s+");\r
-\r
- String line;\r
- String sname = "", seqstr = null;\r
- do {\r
- line = infasta.readLine();\r
- if ((line == null) || line.startsWith(">")) {\r
- if (seqstr != null) {\r
- seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
- }\r
- sname = line; // remove >\r
- seqstr = "";\r
- } else {\r
- final String subseq = pattern.matcher(line).replaceAll("");\r
- seqstr += subseq;\r
- }\r
- } while (line != null);\r
-\r
- infasta.close();\r
+        try {\r
+            // FastaReader yields one FastaSequence per record of the stream\r
+            FastaReader reader = new FastaReader(inStream);\r
+            while (reader.hasNext()) {\r
+                seqs.add(reader.next());\r
+            }\r
+        } finally {\r
+            // this method owns the closing of the stream: release it even\r
+            // when reading a record fails, not only on the success path\r
+            inStream.close();\r
+        }\r
return seqs;\r
}\r
\r
outWriter.close();\r
}\r
\r
+    /**\r
+     * Reads an IUPred output file. The kind of result stored in the file is\r
+     * derived from the file name by {@link IUPredResult#getType(File)}; the\r
+     * file is always closed before this method returns or throws.\r
+     * \r
+     * @param result\r
+     *            the IUPred output file to parse\r
+     * @return Map key->sequence name, value->Score\r
+     * @throws IOException\r
+     *             if the file cannot be opened or closed\r
+     * @throws UnknownFileFormatException\r
+     *             propagated from the parser of the file content\r
+     */\r
+    public static Map<String, Score> readIUPred(final File result)\r
+            throws IOException, UnknownFileFormatException {\r
+        InputStream input = new FileInputStream(result);\r
+        try {\r
+            return readIUPred(input, IUPredResult.getType(result));\r
+        } finally {\r
+            // release the file handle even when the type detection or the\r
+            // parsing fails\r
+            input.close();\r
+        }\r
+    }\r
+\r
+    /**\r
+     * Parses IUPred output read from a stream. The input is cut into entries\r
+     * at each "#" character and the first line of an entry is taken as the\r
+     * sequence name. For the {@link IUPredResult#Glob} type the rest of the\r
+     * entry is parsed as globular domains, for any other type (long | short)\r
+     * as per-residue scores. Example from the original notes:\r
+     * \r
+     * ## Long Disorder\r
+     * \r
+     * # P53_HUMAN\r
+     * \r
+     * 1 M 0.9943\r
+     * \r
+     * 2 E 0.9917\r
+     * \r
+     * 3 E 0.9879\r
+     * \r
+     * (every line)\r
+     * \r
+     * The scanner wrapping the stream is closed on the normal return path.\r
+     * \r
+     * @param input\r
+     *            the stream holding the IUPred output\r
+     * @param type\r
+     *            the kind of IUPred result the stream contains\r
+     * @return Map key->sequence name, value->Score\r
+     * @throws IOException\r
+     * @throws UnknownFileFormatException\r
+     *             propagated from the score parser\r
+     */\r
+    private static Map<String, Score> readIUPred(InputStream input,\r
+            IUPredResult type) throws IOException, UnknownFileFormatException {\r
+\r
+        final Map<String, Score> scoresByName = new HashMap<String, Score>();\r
+        final Scanner entries = new Scanner(input).useDelimiter("#");\r
+        while (entries.hasNext()) {\r
+            final Scanner entry = new Scanner(entries.next());\r
+            final String seqName = entry.nextLine().trim();\r
+            final Score score;\r
+            if (type == IUPredResult.Glob) {\r
+                // domains: ranges only\r
+                score = new Score(type, parseIUPredDomains(entry));\r
+            } else {\r
+                // short | long: one value per residue\r
+                score = new Score(type, parseIUPredScores(entry));\r
+            }\r
+            entry.close();\r
+            scoresByName.put(seqName, score);\r
+        }\r
+\r
+        entries.close();\r
+        return scoresByName;\r
+    }\r
+\r
+    /**\r
+     * Parses the globular domain section of one IUPred entry, e.g.\r
+     * \r
+     * # P53_HUMA\r
+     * \r
+     * Number of globular domains: 2\r
+     * \r
+     * globular domain 1. 98 - 269\r
+     * \r
+     * globular domain 2. 431 - 482\r
+     * \r
+     * >P53_HUMA\r
+     * \r
+     * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp\r
+     * \r
+     * Only the counter line and as many domain lines as it announces are\r
+     * consumed. The expected line prefixes are checked by assertions only.\r
+     * \r
+     * @param scan\r
+     *            scanner whose next line is the "Number of globular domains:"\r
+     *            line\r
+     * @return the domain ranges, empty if no domain lines were read\r
+     */\r
+    private static TreeSet<Range> parseIUPredDomains(Scanner scan) {\r
+        final String countPrefix = "Number of globular domains:";\r
+        final String domainPrefix = "globular domain";\r
+\r
+        final String countLine = scan.nextLine().trim();\r
+        assert countLine.startsWith(countPrefix);\r
+        final int domainCount = Integer.parseInt(countLine.substring(\r
+                countPrefix.length()).trim());\r
+\r
+        final TreeSet<Range> ranges = new TreeSet<Range>();\r
+        for (int left = domainCount; left > 0; left--) {\r
+            assert scan.hasNextLine();\r
+            final String domainLine = scan.nextLine();\r
+            assert domainLine.trim().startsWith(domainPrefix);\r
+            // the bounds follow the first "." of the line: "98 - 269"\r
+            final String bounds = domainLine.substring(\r
+                    domainLine.indexOf(".") + 1).trim();\r
+            ranges.add(new Range(bounds.split("-")));\r
+        }\r
+        return ranges;\r
+    }\r
+    /**\r
+     * Parses the per-residue lines of one IUPred entry, e.g.\r
+     * \r
+     * 1 M 0.9943\r
+     * \r
+     * 2 E 0.9917\r
+     * \r
+     * Only the third whitespace separated column of each line is kept. Blank\r
+     * lines are skipped.\r
+     * \r
+     * @param scan\r
+     *            scanner positioned on the first score line of the entry\r
+     * @return the values of the third column, in the order of the lines\r
+     * @throws UnknownFileFormatException\r
+     *             if a non-blank line has fewer than three columns, or if\r
+     *             raised by convertToNumber\r
+     */\r
+    private static float[] parseIUPredScores(Scanner scan)\r
+            throws UnknownFileFormatException {\r
+        List<String> annotation = new ArrayList<String>();\r
+        while (scan.hasNextLine()) {\r
+            String line = scan.nextLine().trim();\r
+            if (line.length() == 0) {\r
+                // e.g. an empty line left before the next "#" entry\r
+                continue;\r
+            }\r
+            String[] val = line.split("\\s+");\r
+            if (val.length < 3) {\r
+                // report a malformed line as a format problem instead of\r
+                // failing with an ArrayIndexOutOfBoundsException\r
+                throw new UnknownFileFormatException(\r
+                        "IUPred score line is expected to have 3 columns, but the line was: "\r
+                                + line);\r
+            }\r
+            annotation.add(val[2]);\r
+        }\r
+        return convertToNumber(annotation\r
+                .toArray(new String[annotation.size()]));\r
+    }\r
+\r
public static Map<String, Score> readJRonn(final File result)\r
throws IOException, UnknownFileFormatException {\r
InputStream input = new FileInputStream(result);\r
* \r
* @param inStream\r
* the InputStream connected to the JRonn output file\r
- * @return List of {@link AnnotatedSequence} objects\r
+ * @return Map key=sequence name value=Score\r
* @throws IOException\r
* is thrown if the inStream has problems accessing the data\r
* @throws UnknownFileFormatException\r
infasta.close();\r
return seqs;\r
}\r
+\r
private static float[] convertToNumber(String[] annotValues)\r
throws UnknownFileFormatException {\r
float[] annotation = new float[annotValues.length];\r
\r
/**\r
+ * Reads Disembl output and returns the scores found for every sequence.\r
+ * The input is cut into records at each ">" character. In every record the\r
+ * first line is the sequence name, the next three lines are handed to\r
+ * parseRanges as the COILS, REM465 and HOTLOOPS ranges (in that order), the\r
+ * following line is the column title (checked by assertion only) and the\r
+ * remainder is read as repeated groups of one residue token followed by\r
+ * three float values (COILS, REM465, HOTLOOPS). The residue tokens are\r
+ * collected but are not part of the returned value. On the normal return\r
+ * path both the scanner and the input stream are closed.\r
+ * \r
+ * Expected layout of the input:\r
+ * \r
* \r
- * TODO complete!\r
+ > Foobar_dundeefriends\r
* \r
- * >Sequence name\r
+ * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
* \r
- * RESIDUE COILS REM465 HOTLOOPS\r
+ * # REM465 355-368\r
* \r
- * M 0.86010 0.88512 0.37094\r
+ * # HOTLOOPS 190-204\r
* \r
- * T 0.79983 0.85864 0.44331 ....\r
-\r
- * >Next Sequence name \r
- * RESIDUE COILS REM465 HOTLOOPS\r
+ * # RESIDUE COILS REM465 HOTLOOPS\r
* \r
* M 0.86010 0.88512 0.37094\r
* \r
+ * T 0.79983 0.85864 0.44331\r
+ * \r
+ * >Next Sequence name\r
+ * \r
* \r
* @param input\r
- * @return\r
+ * the InputStream\r
+ * @return Map key=sequence name, value=set of score\r
* @throws IOException\r
* @throws UnknownFileFormatException\r
+ * if the input holds no token at all\r
+ * @throws RuntimeException\r
+ * if a record holds no line at all\r
*/\r
- public static Map<FastaSequence, Set<Score>> readDisembl(final InputStream input)\r
- throws IOException, UnknownFileFormatException {\r
+ public static HashMap<String, Set<Score>> readDisembl(\r
+ final InputStream input) throws IOException,\r
+ UnknownFileFormatException {\r
Scanner scan = new Scanner(input);\r
scan.useDelimiter(">");\r
if (!scan.hasNext()) {\r
throw new UnknownFileFormatException(\r
- "In Disembl score format each sequence score is expected " +\r
- "to start from the line: >Sequence name "\r
+ "In Disembl score format each sequence score is expected "\r
+ + "to start from the line: >Sequence name "\r
+ " No such line was found!");\r
}\r
\r
- Map<FastaSequence, Set<Score>> results = new HashMap<FastaSequence, Set<Score>>();\r
+ HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
int seqCounter = 0;\r
while (scan.hasNext()) {\r
seqCounter++;\r
String singleSeq = scan.next();\r
- Scanner scansingle = new Scanner(singleSeq);\r
- if(!scansingle.hasNextLine()) {\r
- throw new RuntimeException("The input looks like an incomplete disembl file - cannot parse!");\r
- }\r
- \r
+ Scanner scansingle = new Scanner(singleSeq);\r
+ if (!scansingle.hasNextLine()) {\r
+ throw new RuntimeException(\r
+ "The input looks like an incomplete disembl file - cannot parse!");\r
+ }\r
+\r
StringBuffer seqbuffer = new StringBuffer();\r
ArrayList<Float> coils = new ArrayList<Float>();\r
ArrayList<Float> rem = new ArrayList<Float>();\r
ArrayList<Float> hotloops = new ArrayList<Float>();\r
\r
String sequenceName = scansingle.nextLine().trim();\r
- String title = scansingle.nextLine();\r
+ TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
+ scansingle.nextLine());\r
+ TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
+ scansingle.nextLine());\r
+ TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
+ scansingle.nextLine());\r
+\r
+ String title = scansingle.nextLine();\r
assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
- \r
+\r
while (scansingle.hasNext()) {\r
seqbuffer.append(scansingle.next());\r
coils.add(scansingle.nextFloat());\r
rem.add(scansingle.nextFloat());\r
hotloops.add(scansingle.nextFloat());\r
}\r
- FastaSequence fs = new FastaSequence(sequenceName,seqbuffer.toString());\r
- Set<Score> scores = new HashSet<Score>();\r
- scores.add(new Score(DisemblResultAnnot.COILS, coils));\r
- scores.add(new Score(DisemblResultAnnot.HOTLOOPS, hotloops));\r
- scores.add(new Score(DisemblResultAnnot.REM465, rem));\r
- results.put(fs, scores);\r
+ /*\r
+ * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
+ * seqbuffer.toString());\r
+ */\r
+ HashSet<Score> scores = new HashSet<Score>();\r
+ scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
+ scores.add(new Score(DisemblResult.REM465, rem, rem465R));\r
+ scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, loopsR));\r
+ results.put(sequenceName, scores);\r
\r
scansingle.close();\r
}\r
-\r
+ scan.close();\r
input.close();\r
return results;\r
}\r
- \r
- public static Map<String, Set<Score>> removeSequences(Map<FastaSequence, Set<Score>> disemblResults) { \r
- Map<String, Set<Score>> seqNameScores = new HashMap<String, Set<Score>>();\r
- for(Map.Entry<FastaSequence,Set<Score>> dres: disemblResults.entrySet()) {\r
- seqNameScores.put(dres.getKey().getId(),dres.getValue()); \r
+\r
+    /**\r
+     * Parses one line listing the ranges of a result type, such as:\r
+     * \r
+     * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
+     * 350-391, 429-485, 497-506, 539-547\r
+     * \r
+     * # REM465 355-368\r
+     * \r
+     * # HOTLOOPS 190-204\r
+     * \r
+     * The first two whitespace separated tokens (the "#" and the type name)\r
+     * are skipped; they are compared with the expectation by assertions only.\r
+     * The rest of the text is split on "," and every non-empty piece is split\r
+     * on "-" to build a Range.\r
+     * \r
+     * @param resultType\r
+     *            the result type the line is expected to describe\r
+     * @param lines\r
+     *            the text to parse\r
+     * @return the parsed ranges, empty if the text lists none\r
+     */\r
+    private static TreeSet<Range> parseRanges(Enum<?> resultType, String lines) {\r
+        TreeSet<Range> ranges = new TreeSet<Range>();\r
+\r
+        Scanner scan = new Scanner(lines);\r
+        try {\r
+            assert scan.hasNext();\r
+            String del = scan.next();\r
+            assert "#".equals(del); // pass delimiter #\r
+            String type = scan.next(); // pass enum name e.g. COILS\r
+            assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
+                    + resultType.toString();\r
+\r
+            // beginning of the ranges\r
+            scan.useDelimiter(",");\r
+            while (scan.hasNext()) {\r
+                String range = scan.next();\r
+                if (!Util.isEmpty(range)) {\r
+                    ranges.add(new Range(range.split("-")));\r
+                }\r
+            }\r
+        } finally {\r
+            scan.close();\r
+        }\r
+        return ranges;\r
+    }\r
+\r
+    /**\r
+     * Reads GlobPlot output and returns the scores found for every sequence.\r
+     * The input is cut into records at each ">" character; a record is read\r
+     * as:\r
+     * \r
+     * >Sequence name\r
+     * \r
+     * # GlobDoms from-to, from-to, ...\r
+     * \r
+     * # Disorder from-to, from-to, ...\r
+     * \r
+     * # RESIDUE DYDX RAW SMOOTHED\r
+     * \r
+     * residue dydx raw smoothed (repeated until the end of the record)\r
+     * \r
+     * >Next Sequence name\r
+     * \r
+     * The column title line is checked by assertion only. The residue tokens\r
+     * are collected but are not part of the returned value. On the normal\r
+     * return path both the scanner and the input stream are closed.\r
+     * \r
+     * @param input\r
+     *            the InputStream holding the GlobPlot output\r
+     * @return Map key=sequence name, value=set of score (Disorder and GlobDoms\r
+     *         are built from ranges only; Dydx, RawScore and SmoothedScore\r
+     *         from per-residue values only)\r
+     * @throws IOException\r
+     *             if closing the stream fails\r
+     * @throws UnknownFileFormatException\r
+     *             if the input holds no token at all\r
+     */\r
+    public static HashMap<String, Set<Score>> readGlobPlot(\r
+            final InputStream input) throws IOException,\r
+            UnknownFileFormatException {\r
+        Scanner scan = new Scanner(input);\r
+        scan.useDelimiter(">");\r
+        if (!scan.hasNext()) {\r
+            throw new UnknownFileFormatException(\r
+                    "In GlobPlot score format each sequence score is expected "\r
+                            + "to start from the line: >Sequence name "\r
+                            + " No such line was found!");\r
+        }\r
+\r
+        HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
+        while (scan.hasNext()) {\r
+            String singleSeq = scan.next();\r
+            Scanner scansingle = new Scanner(singleSeq);\r
+            if (!scansingle.hasNextLine()) {\r
+                throw new RuntimeException(\r
+                        "The input looks like an incomplete GlobPlot file - cannot parse!");\r
+            }\r
+\r
+            StringBuffer seqbuffer = new StringBuffer();\r
+            ArrayList<Float> dydxScore = new ArrayList<Float>();\r
+            ArrayList<Float> rawScore = new ArrayList<Float>();\r
+            ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
+\r
+            String sequenceName = scansingle.nextLine().trim();\r
+            TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
+                    scansingle.nextLine());\r
+            TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
+                    scansingle.nextLine());\r
+\r
+            String title = scansingle.nextLine();\r
+            assert title.startsWith("# RESIDUE DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
+\r
+            // one residue token followed by its three values per iteration\r
+            while (scansingle.hasNext()) {\r
+                seqbuffer.append(scansingle.next());\r
+                dydxScore.add(scansingle.nextFloat());\r
+                rawScore.add(scansingle.nextFloat());\r
+                smoothedScore.add(scansingle.nextFloat());\r
+            }\r
+            /*\r
+             * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
+             * seqbuffer.toString());\r
+             */\r
+            Set<Score> scores = new TreeSet<Score>();\r
+            scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
+            scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
+            scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
+            scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
+            scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
+            results.put(sequenceName, scores);\r
+\r
+            scansingle.close();\r
}\r
- return seqNameScores;\r
+        scan.close();\r
+        input.close();\r
+        return results;\r
}\r
- \r
/**\r
* Read AACon result with no alignment files. This method leaves incoming\r
- * the InputStream results open!\r
+ * InputStream open!\r
* \r
* @param results\r
* output file of AAConservation\r
}\r
\r
}\r
+\r
+/** Disembl result types; each of them contains both ranges and scores. */\r
+enum DisemblResult {\r
+    COILS,\r
+    REM465,\r
+    HOTLOOPS;\r
+}\r
+/** GlobPlot result types; a type holds either ranges or scores, never both. */\r
+enum GlobProtResult {\r
+\r
+    /** Ranges only, no scores. */\r
+    GlobDoms,\r
+\r
+    /** Ranges only, no scores. */\r
+    Disorder,\r
+\r
+    /** Scores only, no ranges. */\r
+    Dydx,\r
+\r
+    /** Scores only, no ranges. */\r
+    SmoothedScore,\r
+\r
+    /** Scores only, no ranges. */\r
+    RawScore;\r
+}\r
+\r
+/** IUPred result types; the type of a result file is encoded in its name. */\r
+enum IUPredResult {\r
+\r
+    /** Short disorder */\r
+    Short,\r
+\r
+    /** Long disorder */\r
+    Long,\r
+\r
+    /** Globular domains */\r
+    Glob;\r
+\r
+    /**\r
+     * Derives the result type from the end of the file name: the name must\r
+     * end with the lower-cased name of one of the constants.\r
+     * \r
+     * @param file\r
+     *            the IUPred result file, checked for null by assertion only\r
+     * @return the type whose lower-cased name ends the file name\r
+     * @throws AssertionError\r
+     *             if the file name ends with none of the type names\r
+     */\r
+    static IUPredResult getType(File file) {\r
+        assert file != null;\r
+        final String fileName = file.getName();\r
+        // the three suffixes cannot match the same name, so the order in\r
+        // which the constants are tried does not matter\r
+        for (IUPredResult candidate : values()) {\r
+            final String suffix = candidate.toString().toLowerCase();\r
+            if (fileName.endsWith(suffix)) {\r
+                return candidate;\r
+            }\r
+        }\r
+        throw new AssertionError(\r
+                "IUPred result file type cannot be recognised! "\r
+                        + "\nFile must ends with one of [glob, long or short]"\r
+                        + "\n but given file name was: " + file.getName());\r
+    }\r
+}
\ No newline at end of file