-/* \r
- * @(#)SequenceUtil.java 1.0 September 2009\r
- * \r
- * Copyright (c) 2009 Peter Troshin\r
- * \r
- * Jalview Web Services version: 2.0 \r
- * \r
- * This library is free software; you can redistribute it and/or modify it under the terms of the\r
- * Apache License version 2 as published by the Apache Software Foundation\r
- * \r
- * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
- * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
- * License for more details.\r
- * \r
- * A copy of the license is in apache_license.txt. It is also available here:\r
- * see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
- * \r
- * Any republication or derived work distributed in source code form\r
- * must include this copyright and license notice.\r
+/*\r
+ * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
+ * Jalview Web Services version: 2.0 This library is free software; you can\r
+ * redistribute it and/or modify it under the terms of the Apache License\r
+ * version 2 as published by the Apache Software Foundation This library is\r
+ * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
+ * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
+ * license is in apache_license.txt. It is also available here: see:\r
+ * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
+ * work distributed in source code form must include this copyright and license\r
+ * notice.\r
*/\r
\r
package compbio.data.sequence;\r
import java.io.OutputStream;\r
import java.io.OutputStreamWriter;\r
import java.util.ArrayList;\r
+import java.util.HashMap;\r
+import java.util.HashSet;\r
import java.util.List;\r
+import java.util.Map;\r
+import java.util.Scanner;\r
+import java.util.TreeSet;\r
import java.util.logging.Level;\r
import java.util.regex.Matcher;\r
import java.util.regex.Pattern;\r
\r
+import compbio.util.Util;\r
+\r
/**\r
* Utility class for operations on sequences\r
* \r
*/\r
public final class SequenceUtil {\r
\r
- /**\r
- * A whitespace character: [\t\n\x0B\f\r]\r
- */\r
- public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
-\r
- /**\r
- * A digit\r
- */\r
- public static final Pattern DIGIT = Pattern.compile("\\d");\r
-\r
- /**\r
- * Non word\r
- */\r
- public static final Pattern NONWORD = Pattern.compile("\\W");\r
-\r
- /**\r
- * Valid Amino acids\r
- */\r
- public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
- Pattern.CASE_INSENSITIVE);\r
-\r
- /**\r
- * inversion of AA pattern\r
- */\r
- public static final Pattern NON_AA = Pattern.compile(\r
- "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
-\r
- /**\r
- * Same as AA pattern but with two additional letters - XU\r
- */\r
- public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
- "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
-\r
- /**\r
- * Nucleotides a, t, g, c, u\r
- */\r
- public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
- Pattern.CASE_INSENSITIVE);\r
-\r
- /**\r
- * Ambiguous nucleotide\r
- */\r
- public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
- "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
- /**\r
- * Non nucleotide\r
- */\r
- public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
- Pattern.CASE_INSENSITIVE);\r
-\r
- private SequenceUtil() {\r
- } // utility class, no instantiation\r
-\r
- /*\r
- * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
- * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
- * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
- * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
- * SysPrefs.newlinechar); pir_out.close(); }\r
- * \r
- * public static void write_FastaSeq(OutputStream os, FastaSequence seq)\r
- * throws IOException { BufferedWriter fasta_out = new BufferedWriter( new\r
- * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
- * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
- * SysPrefs.newlinechar); fasta_out.close(); }\r
- */\r
-\r
- /**\r
- * @return true is the sequence contains only letters a,c, t, g, u\r
- */\r
- public static boolean isNucleotideSequence(final FastaSequence s) {\r
- return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
- }\r
-\r
- /**\r
- * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
- * (!) - B char\r
- */\r
- public static boolean isNonAmbNucleotideSequence(String sequence) {\r
- sequence = SequenceUtil.cleanSequence(sequence);\r
- if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
- return false;\r
+ /**\r
+ * A whitespace character: [\t\n\x0B\f\r]\r
+ */\r
+ public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
+\r
+ /**\r
+ * A digit\r
+ */\r
+ public static final Pattern DIGIT = Pattern.compile("\\d");\r
+\r
+ /**\r
+ * Non word\r
+ */\r
+ public static final Pattern NONWORD = Pattern.compile("\\W");\r
+\r
+ /**\r
+ * Valid Amino acids\r
+ */\r
+ public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
+ Pattern.CASE_INSENSITIVE);\r
+\r
+ /**\r
+ * inversion of AA pattern\r
+ */\r
+ public static final Pattern NON_AA = Pattern.compile(\r
+ "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
+\r
+ /**\r
+ * Same as AA pattern but with two additional letters - XU\r
+ */\r
+ public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
+ "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
+\r
+ /**\r
+ * Nucleotides a, t, g, c, u\r
+ */\r
+ public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
+ Pattern.CASE_INSENSITIVE);\r
+\r
+ /**\r
+ * Ambiguous nucleotide\r
+ */\r
+ public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
+ "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
+ /**\r
+ * Non nucleotide\r
+ */\r
+ public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
+ Pattern.CASE_INSENSITIVE);\r
+\r
+ private SequenceUtil() {\r
+ } // utility class, no instantiation\r
+\r
+ /*\r
+ * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
+ * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
+ * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
+ * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
+ * SysPrefs.newlinechar); pir_out.close(); } public static void\r
+ * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
+ * BufferedWriter fasta_out = new BufferedWriter( new\r
+ * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
+ * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
+ * SysPrefs.newlinechar); fasta_out.close(); }\r
+ */\r
+\r
+ /**\r
+ * @return true if the sequence contains only the letters a, c, t, g, u\r
+ */\r
+ public static boolean isNucleotideSequence(final FastaSequence s) {\r
+ return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
+ }\r
+\r
+ /**\r
+ * True only if the cleaned, non-empty sequence holds nothing but AGTCU. (The\r
+ * ambiguous DNA set AGTCRYMKSWHBVDN differs from protein in only one (!) - B)\r
+ */\r
+ public static boolean isNonAmbNucleotideSequence(String sequence) {\r
+ sequence = SequenceUtil.cleanSequence(sequence);\r
+ if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
+ return false;\r
+ }\r
+ if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
+ return false;\r
+ /*\r
+ * System.out.format("I found the text starting at " +\r
+ * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
+ * nonDNAmatcher.end());\r
+ */\r
+ }\r
+ final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
+ return DNAmatcher.find();\r
+ }\r
+\r
+ /**\r
+ * Removes all whitespace chars from the sequence and converts it to upper case\r
+ * \r
+ * @param sequence the sequence to clean, must not be null\r
+ * @return the upper-cased sequence with all whitespace removed\r
+ */\r
+ public static String cleanSequence(String sequence) {\r
+ assert sequence != null;\r
+ final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
+ sequence = m.replaceAll("").toUpperCase();\r
+ return sequence;\r
+ }\r
+\r
+ /**\r
+ * Removes all special characters and digits as well as whitespace chars\r
+ * from the sequence and converts it to upper case\r
+ * \r
+ * @param sequence the sequence to clean\r
+ * @return the cleaned up, upper-cased sequence\r
+ */\r
+ public static String deepCleanSequence(String sequence) {\r
+ sequence = SequenceUtil.cleanSequence(sequence);\r
+ sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
+ sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
+ final Pattern othernonSeqChars = Pattern.compile("[_-]+"); // '_' is a word char, so NONWORD keeps it\r
+ sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
+ return sequence;\r
+ }\r
+\r
+ /**\r
+ * @param sequence the sequence to test\r
+ * @return true if the sequence is a protein sequence, false otherwise\r
+ */\r
+ public static boolean isProteinSequence(String sequence) {\r
+ sequence = SequenceUtil.cleanSequence(sequence);\r
+ if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
+ return false;\r
+ }\r
+ if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
+ return false;\r
+ }\r
+ if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
+ return false;\r
+ }\r
+ final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
+ return protmatcher.find();\r
+ }\r
+\r
+ /**\r
+ * Checks whether the sequence conforms to an ambiguous protein sequence\r
+ * \r
+ * @param sequence the sequence to test\r
+ * @return true only if the sequence is an ambiguous protein sequence (has\r
+ * an X or U besides standard residues). False otherwise, e.g. if\r
+ * the sequence is a non-ambiguous protein or DNA\r
+ */\r
+ public static boolean isAmbiguosProtein(String sequence) {\r
+ sequence = SequenceUtil.cleanSequence(sequence);\r
+ if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
+ return false;\r
+ }\r
+ if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
+ return false;\r
+ }\r
+ // Every residue must belong to the extended (AA + X, U) alphabet;\r
+ // matches() is anchored, so it also rejects the empty string.\r
+ if (!SequenceUtil.AMBIGUOUS_AA.matcher(sequence).matches()) {\r
+ return false;\r
+ }\r
+ // A strictly standard sequence is not ambiguous: require at least\r
+ // one residue outside the 20-letter alphabet, i.e. an X or a U.\r
+ return SequenceUtil.NON_AA.matcher(sequence).find();\r
+ }\r
+\r
+ /**\r
+ * Writes list of FastaSequences into the outstream formatting the sequence\r
+ * so that it contains width chars on each line, then closes the outstream\r
+ * \r
+ * @param outstream the stream to write to, it is closed by this method\r
+ * @param sequences the sequences to write\r
+ * @param width\r
+ * - the maximum number of characters to write in one line\r
+ * @throws IOException\r
+ */\r
+ public static void writeFasta(final OutputStream outstream,\r
+ final List<FastaSequence> sequences, final int width)\r
+ throws IOException {\r
+ writeFastaKeepTheStream(outstream, sequences, width);\r
+ outstream.close();\r
+ }\r
+\r
+ public static void writeFastaKeepTheStream(final OutputStream outstream,\r
+ final List<FastaSequence> sequences, final int width)\r
+ throws IOException {\r
+ final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
+ final BufferedWriter fastawriter = new BufferedWriter(writer);\r
+ for (final FastaSequence fs : sequences) {\r
+ fastawriter.write(">" + fs.getId() + "\n");\r
+ fastawriter.write(fs.getFormatedSequence(width));\r
+ fastawriter.write("\n");\r
+ }\r
+ fastawriter.flush();\r
+ writer.flush();\r
}\r
- if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
- return false;\r
- /*\r
- * System.out.format("I found the text starting at " +\r
- * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
- * nonDNAmatcher.end());\r
- */\r
+\r
+ /**\r
+ * Reads fasta sequences from inStream into the list of FastaSequence\r
+ * objects. Whitespace inside the sequence lines is removed.\r
+ * \r
+ * @param inStream\r
+ * the stream to read from, it is closed by this method\r
+ * @return list of FastaSequence objects\r
+ * @throws IOException\r
+ */\r
+ public static List<FastaSequence> readFasta(final InputStream inStream)\r
+ throws IOException {\r
+ final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
+\r
+ final BufferedReader infasta = new BufferedReader(\r
+ new InputStreamReader(inStream, "UTF8"), 16000);\r
+ final Pattern pattern = Pattern.compile("\\s+");\r
+\r
+ String line;\r
+ String sname = "", seqstr = null;\r
+ do {\r
+ line = infasta.readLine();\r
+ if ((line == null) || line.startsWith(">")) {\r
+ if (seqstr != null) {\r
+ seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
+ }\r
+ sname = line; // header kept whole, '>' is cut off by substring(1) above\r
+ seqstr = "";\r
+ } else {\r
+ final String subseq = pattern.matcher(line).replaceAll("");\r
+ seqstr += subseq;\r
+ }\r
+ } while (line != null);\r
+\r
+ infasta.close();\r
+ return seqs;\r
 }\r
- final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
- return DNAmatcher.find();\r
- }\r
-\r
- /**\r
- * Removes all whitespace chars in the sequence string\r
- * \r
- * @param sequence\r
- * @return cleaned up sequence\r
- */\r
- public static String cleanSequence(String sequence) {\r
- assert sequence != null;\r
- final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
- sequence = m.replaceAll("").toUpperCase();\r
- return sequence;\r
- }\r
-\r
- /**\r
- * Removes all special characters and digits as well as whitespace chars\r
- * from the sequence\r
- * \r
- * @param sequence\r
- * @return cleaned up sequence\r
- */\r
- public static String deepCleanSequence(String sequence) {\r
- sequence = SequenceUtil.cleanSequence(sequence);\r
- sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
- sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
- final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
- sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
- return sequence;\r
- }\r
-\r
- /**\r
- * \r
- * @param sequence\r
- * @return true is the sequence is a protein sequence, false overwise\r
- */\r
- public static boolean isProteinSequence(String sequence) {\r
- sequence = SequenceUtil.cleanSequence(sequence);\r
- if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
- return false;\r
+\r
+ /**\r
+ * Writes FastaSequence in the file, each sequence will take one line only\r
+ * \r
+ * @param os\r
+ * @param sequences\r
+ * @throws IOException\r
+ */\r
+ public static void writeFasta(final OutputStream os,\r
+ final List<FastaSequence> sequences) throws IOException {\r
+ final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
+ final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
+ for (final FastaSequence fs : sequences) {\r
+ fasta_out.write(fs.getOnelineFasta());\r
+ }\r
+ fasta_out.close();\r
+ outWriter.close();\r
}\r
- if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
- return false;\r
+\r
+ public static Map<String, Score> readJRonn(final File result)\r
+ throws IOException, UnknownFileFormatException {\r
+ InputStream input = new FileInputStream(result);\r
+ Map<String, Score> sequences = readJRonn(input);\r
+ input.close();\r
+ return sequences;\r
}\r
- if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
- return false;\r
+\r
+ /**\r
+ * Reader for JRonn horizontal file format\r
+ * \r
+ * <pre>\r
+ * >Foobar\r
+ * M G D T T A G\r
+ * 0.48 0.42 0.42 0.48 0.52 0.53 0.54\r
+ * </pre>\r
+ * Where all values are tab delimited\r
+ * \r
+ * @param inStream\r
+ * the InputStream connected to the JRonn output file\r
+ * @return Map of sequence name to the JRonn {@link Score} read for it\r
+ * @throws IOException\r
+ * is thrown if the inStream has problems accessing the data\r
+ * @throws UnknownFileFormatException\r
+ * is thrown if the inStream represents an unknown source of\r
+ * data, i.e. not a JRonn output\r
+ */\r
+ public static Map<String, Score> readJRonn(final InputStream inStream)\r
+ throws IOException, UnknownFileFormatException {\r
+ final Map<String, Score> seqs = new HashMap<String, Score>();\r
+\r
+ final BufferedReader infasta = new BufferedReader(\r
+ new InputStreamReader(inStream, "UTF8"), 16000);\r
+\r
+ String line;\r
+ String sname = "";\r
+ do {\r
+ line = infasta.readLine();\r
+ if (line == null || line.isEmpty()) {\r
+ // skip empty lines\r
+ continue;\r
+ }\r
+ if (line.startsWith(">")) {\r
+ sname = line.trim().substring(1); // name without the leading '>'\r
+ final String seqLine = infasta.readLine(); // residues\r
+ final String annotLine = infasta.readLine(); // disorder values\r
+ if (seqLine == null || annotLine == null) { // truncated record\r
+ throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE);\r
+ }\r
+ final String sequence = seqLine.replace("\t", "");\r
+ String[] annotValues = annotLine.split("\t");\r
+ float[] annotation = convertToNumber(annotValues);\r
+ if (annotation.length != sequence.length()) {\r
+ throw new UnknownFileFormatException(\r
+ "File does not look like Jronn horizontally formatted output file!\n"\r
+ + JRONN_WRONG_FORMAT_MESSAGE);\r
+ }\r
+ seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
+ }\r
+ } while (line != null);\r
+\r
+ infasta.close();\r
+ return seqs;\r
 }\r
- final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
- return protmatcher.find();\r
- }\r
-\r
- /**\r
- * Check whether the sequence confirms to amboguous protein sequence\r
- * \r
- * @param sequence\r
- * @return return true only if the sequence if ambiguous protein sequence\r
- * Return false otherwise. e.g. if the sequence is non-ambiguous\r
- * protein or DNA\r
- */\r
- public static boolean isAmbiguosProtein(String sequence) {\r
- sequence = SequenceUtil.cleanSequence(sequence);\r
- if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
- return false;\r
+\r
+ private static float[] convertToNumber(String[] annotValues)\r
+ throws UnknownFileFormatException {\r
+ float[] annotation = new float[annotValues.length]; // one float per token\r
+ try {\r
+ for (int i = 0; i < annotation.length; i++) {\r
+ annotation[i] = Float.parseFloat(annotValues[i]);\r
+ }\r
+ } catch (NumberFormatException e) {\r
+ throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
+ e); // chain e itself: its getCause() is null for a parse failure\r
+ }\r
+ return annotation;\r
 }\r
- if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
- return false;\r
+\r
+ private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
+ + ">sequence_name\n "\r
+ + "M V S\n"\r
+ + "0.43 0.22 0.65\n"\r
+ + "Where first line is the sequence name,\n"\r
+ + "second line is the tab delimited sequence,\n"\r
+ + "third line contains tab delimited disorder prediction values.\n"\r
+ + "No lines are allowed between these three. Additionally, the number of "\r
+ + "sequence residues must be equal to the number of the disorder values.";\r
+\r
+ /**\r
+ * Closes the Closeable and logs the exception if any\r
+ * \r
+ * @param log the logger that receives a WARNING if closing fails\r
+ * @param stream the resource to close, null is ignored\r
+ */\r
+ public final static void closeSilently(java.util.logging.Logger log,\r
+ Closeable stream) {\r
+ if (stream != null) {\r
+ try {\r
+ stream.close();\r
+ } catch (IOException e) {\r
+ log.log(Level.WARNING, e.getLocalizedMessage(), e);\r
+ }\r
+ }\r
 }\r
- if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
- return false;\r
+\r
+ /**\r
+ * Reads Disembl output; every record is expected to look like this:\r
+ * > Foobar_dundeefriends\r
+ * \r
+ * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
+ * \r
+ * # REM465 355-368\r
+ * \r
+ * # HOTLOOPS 190-204\r
+ * \r
+ * # RESIDUE COILS REM465 HOTLOOPS\r
+ * \r
+ * M 0.86010 0.88512 0.37094\r
+ * \r
+ * T 0.79983 0.85864 0.44331\r
+ * \r
+ * >Next Sequence name\r
+ * \r
+ * \r
+ * @param input the stream with Disembl results, it is closed by this method\r
+ * @return map of each sequence to its COILS, REM465 and HOTLOOPS scores\r
+ * @throws IOException\r
+ * @throws UnknownFileFormatException if the input holds no records at all\r
+ */\r
+ public static HashMap<FastaSequence, HashSet<Score>> readDisembl(\r
+ final InputStream input) throws IOException,\r
+ UnknownFileFormatException {\r
+ Scanner scan = new Scanner(input);\r
+ scan.useDelimiter(">");\r
+ if (!scan.hasNext()) {\r
+ throw new UnknownFileFormatException(\r
+ "In Disembl score format each sequence score is expected "\r
+ + "to start from the line: >Sequence name "\r
+ + " No such line was found!");\r
+ }\r
+\r
+ HashMap<FastaSequence, HashSet<Score>> results = new HashMap<FastaSequence, HashSet<Score>>();\r
+ int seqCounter = 0;\r
+ while (scan.hasNext()) {\r
+ seqCounter++;\r
+ String singleSeq = scan.next();\r
+ Scanner scansingle = new Scanner(singleSeq);\r
+ if (!scansingle.hasNextLine()) {\r
+ throw new RuntimeException(\r
+ "The input looks like an incomplete disembl file - cannot parse!");\r
+ }\r
+\r
+ StringBuffer seqbuffer = new StringBuffer();\r
+ ArrayList<Float> coils = new ArrayList<Float>();\r
+ ArrayList<Float> rem = new ArrayList<Float>();\r
+ ArrayList<Float> hotloops = new ArrayList<Float>();\r
+\r
+ String sequenceName = scansingle.nextLine().trim();\r
+ TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
+ scansingle.nextLine());\r
+ TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
+ scansingle.nextLine());\r
+ TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
+ scansingle.nextLine());\r
+\r
+ String title = scansingle.nextLine();\r
+ assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
+\r
+ while (scansingle.hasNext()) {\r
+ seqbuffer.append(scansingle.next());\r
+ coils.add(scansingle.nextFloat());\r
+ rem.add(scansingle.nextFloat());\r
+ hotloops.add(scansingle.nextFloat());\r
+ }\r
+ FastaSequence fs = new FastaSequence(sequenceName,\r
+ seqbuffer.toString());\r
+ HashSet<Score> scores = new HashSet<Score>();\r
+ scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
+ scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, loopsR));\r
+ scores.add(new Score(DisemblResult.REM465, rem, rem465R));\r
+ results.put(fs, scores);\r
+\r
+ scansingle.close();\r
+ }\r
+ scan.close();\r
+ input.close();\r
+ return results;\r
 }\r
- if (SequenceUtil.AA.matcher(sequence).find()) {\r
- return false;\r
+\r
+ /**\r
+ * Parses one line of comma separated ranges, for example:\r
+ * \r
+ * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
+ * 350-391, 429-485, 497-506, 539-547\r
+ * \r
+ * # REM465 355-368\r
+ * \r
+ * # HOTLOOPS 190-204\r
+ * \r
+ * @param lines the line to parse: '#', the result type name, then the ranges\r
+ * @return the ranges found in the line\r
+ */\r
+ private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
+ TreeSet<Range> ranges = new TreeSet<Range>();\r
+\r
+ Scanner scan = new Scanner(lines);\r
+\r
+ assert scan.hasNext();\r
+ String del = scan.next();\r
+ assert "#".equals(del); // pass delimiter #\r
+ String type = scan.next(); // pass enum name e.g. COILS\r
+ assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
+ + resultType.toString();\r
+\r
+ // beginning of the ranges\r
+ scan.useDelimiter(",");\r
+ while (scan.hasNext()) {\r
+ String range = scan.next();\r
+ if (!Util.isEmpty(range)) {\r
+ ranges.add(new Range(range.split("-")));\r
+ }\r
+ }\r
+ return ranges;\r
 }\r
- final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
- return amb_prot.find();\r
- }\r
-\r
- /**\r
- * Writes list of FastaSequeces into the outstream formatting the sequence\r
- * so that it contains width chars on each line\r
- * \r
- * @param outstream\r
- * @param sequences\r
- * @param width\r
- * - the maximum number of characters to write in one line\r
- * @throws IOException\r
- */\r
- public static void writeFasta(final OutputStream outstream,\r
- final List<FastaSequence> sequences, final int width)\r
- throws IOException {\r
- final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
- final BufferedWriter fastawriter = new BufferedWriter(writer);\r
- for (final FastaSequence fs : sequences) {\r
- fastawriter.write(fs.getFormatedSequence(width));\r
+\r
+ public static HashMap<String, HashSet<Score>> removeSequences(\r
+ HashMap<FastaSequence, HashSet<Score>> disemblResults) {\r
+ HashMap<String, HashSet<Score>> seqNameScores = new HashMap<String, HashSet<Score>>();\r
+ for (Map.Entry<FastaSequence, HashSet<Score>> dres : disemblResults\r
+ .entrySet()) {\r
+ seqNameScores.put(dres.getKey().getId(), dres.getValue());\r
+ }\r
+ return seqNameScores;\r
}\r
- outstream.flush();\r
- fastawriter.close();\r
- writer.close();\r
- }\r
-\r
- /**\r
- * Reads fasta sequences from inStream into the list of FastaSequence\r
- * objects\r
- * \r
- * @param inStream\r
- * from\r
- * @return list of FastaSequence objects\r
- * @throws IOException\r
- */\r
- public static List<FastaSequence> readFasta(final InputStream inStream)\r
- throws IOException {\r
- final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
-\r
- final BufferedReader infasta = new BufferedReader(\r
- new InputStreamReader(inStream, "UTF8"), 16000);\r
- final Pattern pattern = Pattern.compile("//s+");\r
-\r
- String line;\r
- String sname = "", seqstr = null;\r
- do {\r
- line = infasta.readLine();\r
- if ((line == null) || line.startsWith(">")) {\r
- if (seqstr != null) {\r
- seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
+\r
+ /**\r
+ * Reads GlobPlot output; every record is expected to look like this:\r
+ * > Foobar_dundeefriends\r
+ * \r
+ * # GlobDoms 2-358, 373-568\r
+ * \r
+ * # Disorder 1-5, 206-218\r
+ * \r
+ * # RESIDUE DYDX RAW SMOOTHED\r
+ * \r
+ * M 0.0044 -0.2259 -0.2259\r
+ * \r
+ * T -0.1308 -0.2170 -0.2170\r
+ * \r
+ * >Next Sequence name\r
+ * \r
+ * (the numeric values above are illustrative only)\r
+ * \r
+ * \r
+ * @param input the stream with GlobPlot results, it is closed by this method\r
+ * @return map of each sequence to its GlobPlot scores\r
+ * @throws IOException\r
+ * @throws UnknownFileFormatException if the input holds no records at all\r
+ */\r
+ public static HashMap<FastaSequence, HashSet<Score>> readGlobPlot(\r
+ final InputStream input) throws IOException,\r
+ UnknownFileFormatException {\r
+ Scanner scan = new Scanner(input);\r
+ scan.useDelimiter(">");\r
+ if (!scan.hasNext()) {\r
+ throw new UnknownFileFormatException(\r
+ "In GlobPlot score format each sequence score is expected "\r
+ + "to start from the line: >Sequence name "\r
+ + " No such line was found!");\r
+ }\r
+\r
+ HashMap<FastaSequence, HashSet<Score>> results = new HashMap<FastaSequence, HashSet<Score>>();\r
+ int seqCounter = 0;\r
+ while (scan.hasNext()) {\r
+ seqCounter++;\r
+ String singleSeq = scan.next();\r
+ Scanner scansingle = new Scanner(singleSeq);\r
+ if (!scansingle.hasNextLine()) {\r
+ throw new RuntimeException(\r
+ "The input looks like an incomplete GlobPlot file - cannot parse!");\r
+ }\r
+\r
+ StringBuffer seqbuffer = new StringBuffer();\r
+ ArrayList<Float> dydxScore = new ArrayList<Float>();\r
+ ArrayList<Float> rawScore = new ArrayList<Float>();\r
+ ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
+\r
+ String sequenceName = scansingle.nextLine().trim();\r
+ TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
+ scansingle.nextLine());\r
+ TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
+ scansingle.nextLine());\r
+\r
+ String title = scansingle.nextLine();\r
+ assert title.startsWith("# RESIDUE DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
+\r
+ while (scansingle.hasNext()) {\r
+ seqbuffer.append(scansingle.next());\r
+ dydxScore.add(scansingle.nextFloat());\r
+ rawScore.add(scansingle.nextFloat());\r
+ smoothedScore.add(scansingle.nextFloat());\r
+ }\r
+ FastaSequence fs = new FastaSequence(sequenceName,\r
+ seqbuffer.toString());\r
+ HashSet<Score> scores = new HashSet<Score>();\r
+ scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
+ scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
+ scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
+ scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
+ scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
+ results.put(fs, scores);\r
+\r
+ scansingle.close();\r
}\r
- sname = line; // remove >\r
- seqstr = "";\r
- } else {\r
- final String subseq = pattern.matcher(line).replaceAll("");\r
- seqstr += subseq;\r
- }\r
- } while (line != null);\r
-\r
- infasta.close();\r
- return seqs;\r
- }\r
-\r
- /**\r
- * Writes FastaSequence in the file, each sequence will take one line only\r
- * \r
- * @param os\r
- * @param sequences\r
- * @throws IOException\r
- */\r
- public static void writeFasta(final OutputStream os,\r
- final List<FastaSequence> sequences) throws IOException {\r
- final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
- final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
- for (final FastaSequence fs : sequences) {\r
- fasta_out.write(fs.getOnelineFasta());\r
+ scan.close();\r
+ input.close();\r
+ return results;\r
}\r
- fasta_out.close();\r
- outWriter.close();\r
- }\r
-\r
- public static List<AnnotatedSequence> readJRonn(final File result)\r
- throws IOException, UnknownFileFormatException {\r
- InputStream input = new FileInputStream(result);\r
- List<AnnotatedSequence> sequences = readJRonn(input);\r
- input.close();\r
- return sequences;\r
- }\r
-\r
- /**\r
- * Reader for JRonn horizontal file format\r
- * \r
- * >Foobar\r
- * \r
- * M G D T T A G\r
- * \r
- * 0.48 0.42 0.42 0.48 0.52 0.53 0.54\r
- * \r
- * All values are tab delimited\r
- * \r
- * @param inStream\r
- * @return\r
- * @throws IOException\r
- * @throws UnknownFileFormatException\r
- */\r
- public static List<AnnotatedSequence> readJRonn(final InputStream inStream)\r
- throws IOException, UnknownFileFormatException {\r
- final List<AnnotatedSequence> seqs = new ArrayList<AnnotatedSequence>();\r
-\r
- final BufferedReader infasta = new BufferedReader(\r
- new InputStreamReader(inStream, "UTF8"), 16000);\r
-\r
- String line;\r
- String sname = "";\r
- do {\r
- line = infasta.readLine();\r
- if (line == null || line.isEmpty()) {\r
- // skip empty lines\r
- continue;\r
- }\r
- if (line.startsWith(">")) {\r
- // read name\r
- sname = line.trim().substring(1);\r
- // read sequence line\r
- line = infasta.readLine();\r
- final String sequence = line.replace("\t", "");\r
- // read annotation line\r
- line = infasta.readLine();\r
- String[] annotValues = line.split("\t");\r
- float[] annotation = convertToNumber(annotValues);\r
- if (annotation.length != sequence.length()) {\r
- throw new UnknownFileFormatException(\r
- "File does not look like Jronn horizontally formatted output file!\n"\r
- + JRONN_WRONG_FORMAT_MESSAGE);\r
+ /**\r
+ * Reads AACon results with no alignment files. This method leaves the\r
+ * incoming InputStream results open!\r
+ * \r
+ * @param results\r
+ * output file of AAConservation\r
+ * @return Set of {@link Score}, one per {@link ConservationMethod} read\r
+ */\r
+ public static HashSet<Score> readAAConResults(InputStream results) {\r
+ if (results == null) {\r
+ throw new NullPointerException(\r
+ "InputStream with results must be provided");\r
+ }\r
+ HashSet<Score> annotations = new HashSet<Score>();\r
+ Scanner sc = new Scanner(results);\r
+ sc.useDelimiter("#");\r
+ while (sc.hasNext()) {\r
+ String line = sc.next();\r
+ int spacePos = line.indexOf(" ");\r
+ assert spacePos > 0 : "Space is expected as delimited between method "\r
+ + "name and values!";\r
+ String methodLine = line.substring(0, spacePos);\r
+ ConservationMethod method = ConservationMethod\r
+ .getMethod(methodLine);\r
+ assert method != null : "Method " + methodLine\r
+ + " is not recognized! ";\r
+ Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
+ ArrayList<Float> values = new ArrayList<Float>();\r
+ while (valuesScanner.hasNextDouble()) {\r
+ Double value = valuesScanner.nextDouble();\r
+ values.add(value.floatValue());\r
+ }\r
+ annotations.add(new Score(method, values));\r
}\r
- seqs.add(new AnnotatedSequence(sname, sequence, annotation));\r
- }\r
- } while (line != null);\r
-\r
- infasta.close();\r
- return seqs;\r
- }\r
-\r
- private static float[] convertToNumber(String[] annotValues)\r
- throws UnknownFileFormatException {\r
- float[] annotation = new float[annotValues.length];\r
- try {\r
- for (int i = 0; i < annotation.length; i++) {\r
- annotation[i] = Float.parseFloat(annotValues[i]);\r
- }\r
- } catch (NumberFormatException e) {\r
- throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE, e\r
- .getCause());\r
+ return annotations;\r
}\r
- return annotation;\r
- }\r
-\r
- private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
- + ">sequence_name\n "\r
- + "M V S\n"\r
- + "0.43 0.22 0.65\n"\r
- + "Where first line is the sequence name,\n"\r
- + "second line is the tab delimited sequence,\n"\r
- + "third line contains tab delimited disorder prediction values.\n"\r
- + "No lines are allowed between these three. Additionally, the number of "\r
- + "sequence residues must be equal to the number of the disorder values.";\r
-\r
- /**\r
- * Closes the Closable and logs the exception if any\r
- * \r
- * @param log\r
- * @param stream\r
- */\r
- public final static void closeSilently(java.util.logging.Logger log,\r
- Closeable stream) {\r
- if (stream != null) {\r
- try {\r
- stream.close();\r
- } catch (IOException e) {\r
- log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
- }\r
+\r
+ /**\r
+ * Reads and parses Fasta or Clustal formatted file into a list of\r
+ * FastaSequence objects\r
+ * \r
+ * @param inFilePath\r
+ * the path to the input file\r
+ * @throws IOException\r
+ * if the file denoted by inFilePath cannot be read\r
+ * @throws UnknownFileFormatException\r
+ * if the inFilePath points to the file which format cannot be\r
+ * recognised\r
+ * @return the List of FastaSequence objects\r
+ * \r
+ */\r
+ public static List<FastaSequence> openInputStream(String inFilePath)\r
+ throws IOException, UnknownFileFormatException {\r
+\r
+ // This stream gets closed in isValidClustalFile method\r
+ InputStream inStrForValidation = new FileInputStream(inFilePath);\r
+ // This stream is closed in the calling methods\r
+ InputStream inStr = new FileInputStream(inFilePath);\r
+ List<FastaSequence> fastaSeqs = null;\r
+ if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
+ Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
+ // alignment cannot be null see\r
+ // ClustalAlignmentUtil.readClustalFile(inStr);\r
+ fastaSeqs = al.getSequences();\r
+ } else {\r
+ fastaSeqs = SequenceUtil.readFasta(inStr);\r
+ }\r
+ return fastaSeqs;\r
}\r
- }\r
-\r
- public static List<AnnotatedSequence> readDisembl(final File result)\r
- throws IOException, UnknownFileFormatException {\r
- InputStream input = new FileInputStream(result);\r
- List<AnnotatedSequence> sequences = readJRonn(input);\r
- input.close();\r
- return sequences;\r
- }\r
+\r
+}\r
+\r
+enum DisemblResult {\r
+ /** These contain ranges and scores */\r
+ COILS, REM465, HOTLOOPS\r
 }\r
+enum GlobProtResult {\r
+ /** This is a range with no scores */\r
+ GlobDoms,\r
+ /** This is a range with no scores */\r
+ Disorder,\r
+ /** This is a score with no range */\r
+ Dydx,\r
+ /** This is a score with no range */\r
+ SmoothedScore,\r
+ /** This is a score with no range */\r
+ RawScore\r
+}
\ No newline at end of file