-/* Copyright (c) 2009 Peter Troshin\r
+/* \r
+ * @(#)SequenceUtil.java 1.0 September 2009\r
+ * \r
+ * Copyright (c) 2009 Peter Troshin\r
* \r
- * JAva Bioinformatics Analysis Web Services (JABAWS) @version: 1.0\r
+ * Jalview Web Services version: 2.0 \r
* \r
* This library is free software; you can redistribute it and/or modify it under the terms of the\r
* Apache License version 2 as published by the Apache Software Foundation\r
* License for more details.\r
* \r
* A copy of the license is in apache_license.txt. It is also available here:\r
- * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
+ * see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
* \r
* Any republication or derived work distributed in source code form\r
* must include this copyright and license notice.\r
import java.io.BufferedReader;\r
import java.io.BufferedWriter;\r
import java.io.Closeable;\r
+import java.io.File;\r
+import java.io.FileInputStream;\r
import java.io.IOException;\r
import java.io.InputStream;\r
import java.io.InputStreamReader;\r
/**\r
* Utility class for operations on sequences\r
* \r
- * @author pvtroshin\r
- * \r
- * Date September 2009\r
+ * @author Petr Troshin\r
+ * @version 1.0\r
*/\r
public final class SequenceUtil {\r
\r
/**\r
* @return true is the sequence contains only letters a,c, t, g, u\r
*/\r
- public static boolean isNucleotideSequence(FastaSequence s) {\r
- return isNonAmbNucleotideSequence(s.getSequence());\r
+ public static boolean isNucleotideSequence(final FastaSequence s) {\r
+ return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
}\r
\r
/**\r
* (!) - B char\r
*/\r
public static boolean isNonAmbNucleotideSequence(String sequence) {\r
- sequence = cleanSequence(sequence);\r
- if (DIGIT.matcher(sequence).find()) {\r
+ sequence = SequenceUtil.cleanSequence(sequence);\r
+ if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
return false;\r
}\r
- if (NON_NUCLEOTIDE.matcher(sequence).find()) {\r
+ if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
return false;\r
/*\r
* System.out.format("I found the text starting at " +\r
* nonDNAmatcher.end());\r
*/\r
}\r
- Matcher DNAmatcher = NUCLEOTIDE.matcher(sequence);\r
+ final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
return DNAmatcher.find();\r
}\r
\r
*/\r
public static String cleanSequence(String sequence) {\r
assert sequence != null;\r
- final Matcher m = WHITE_SPACE.matcher(sequence);\r
+ final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
sequence = m.replaceAll("").toUpperCase();\r
return sequence;\r
}\r
* @return cleaned up sequence\r
*/\r
public static String deepCleanSequence(String sequence) {\r
- sequence = cleanSequence(sequence);\r
- sequence = DIGIT.matcher(sequence).replaceAll("");\r
- sequence = NONWORD.matcher(sequence).replaceAll("");\r
- Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
+ sequence = SequenceUtil.cleanSequence(sequence);\r
+ sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
+ sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
+ final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
return sequence;\r
}\r
* @return true if the sequence is a protein sequence, false otherwise
*/\r
public static boolean isProteinSequence(String sequence) {\r
- sequence = cleanSequence(sequence);\r
- if (isNonAmbNucleotideSequence(sequence)) {\r
+ sequence = SequenceUtil.cleanSequence(sequence);\r
+ if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
return false;\r
}\r
- if (DIGIT.matcher(sequence).find()) {\r
+ if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
return false;\r
}\r
- if (NON_AA.matcher(sequence).find()) {\r
+ if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
return false;\r
}\r
- Matcher protmatcher = AA.matcher(sequence);\r
+ final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
return protmatcher.find();\r
}\r
\r
* protein or DNA\r
*/\r
public static boolean isAmbiguosProtein(String sequence) {\r
- sequence = cleanSequence(sequence);\r
- if (isNonAmbNucleotideSequence(sequence)) {\r
+ sequence = SequenceUtil.cleanSequence(sequence);\r
+ if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
return false;\r
}\r
- if (DIGIT.matcher(sequence).find()) {\r
+ if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
return false;\r
}\r
- if (NON_AA.matcher(sequence).find()) {\r
+ if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
return false;\r
}\r
- if (AA.matcher(sequence).find()) {\r
+ if (SequenceUtil.AA.matcher(sequence).find()) {\r
return false;\r
}\r
- Matcher amb_prot = AMBIGUOUS_AA.matcher(sequence);\r
+ final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
return amb_prot.find();\r
}\r
\r
* - the maximum number of characters to write in one line\r
* @throws IOException\r
*/\r
- public static void writeFasta(OutputStream outstream,\r
- List<FastaSequence> sequences, int width) throws IOException {\r
- OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
- BufferedWriter fastawriter = new BufferedWriter(writer);\r
- for (FastaSequence fs : sequences) {\r
- fastawriter.write(fs.getOnelineFasta());\r
+ public static void writeFasta(final OutputStream outstream,\r
+ final List<FastaSequence> sequences, final int width)\r
+ throws IOException {\r
+ final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
+ final BufferedWriter fastawriter = new BufferedWriter(writer);\r
+ for (final FastaSequence fs : sequences) {\r
+ fastawriter.write(fs.getFormatedSequence(width));\r
}\r
outstream.flush();\r
fastawriter.close();\r
* @return list of FastaSequence objects\r
* @throws IOException\r
*/\r
- public static List<FastaSequence> readFasta(InputStream inStream)\r
+ public static List<FastaSequence> readFasta(final InputStream inStream)\r
throws IOException {\r
- List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
- InputStreamReader inReader = new InputStreamReader(inStream);\r
- BufferedReader infasta = new BufferedReader(inReader);\r
- Pattern pattern = Pattern.compile("//s+");\r
+ final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
+\r
+ final BufferedReader infasta = new BufferedReader(\r
+ new InputStreamReader(inStream, "UTF8"), 16000);\r
+ final Pattern pattern = Pattern.compile("//s+");\r
\r
String line;\r
String sname = "", seqstr = null;\r
do {\r
line = infasta.readLine();\r
- if (line == null || line.startsWith(">")) {\r
- if (seqstr != null)\r
+ if ((line == null) || line.startsWith(">")) {\r
+ if (seqstr != null) {\r
seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
+ }\r
sname = line; // remove >\r
seqstr = "";\r
} else {\r
- String subseq = pattern.matcher(line).replaceAll("");\r
+ final String subseq = pattern.matcher(line).replaceAll("");\r
seqstr += subseq;\r
}\r
} while (line != null);\r
- inReader.close();\r
+\r
infasta.close();\r
return seqs;\r
}\r
* @param sequences\r
* @throws IOException\r
*/\r
- public static void writeFasta(OutputStream os, List<FastaSequence> sequences)\r
- throws IOException {\r
- OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
- BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
- for (FastaSequence fs : sequences) {\r
+ public static void writeFasta(final OutputStream os,\r
+ final List<FastaSequence> sequences) throws IOException {\r
+ final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
+ final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
+ for (final FastaSequence fs : sequences) {\r
fasta_out.write(fs.getOnelineFasta());\r
}\r
fasta_out.close();\r
outWriter.close();\r
}\r
\r
+ public static List<AnnotatedSequence> readJRonn(final File result)\r
+ throws IOException, UnknownFileFormatException {\r
+ InputStream input = new FileInputStream(result);\r
+ List<AnnotatedSequence> sequences = readJRonn(input);\r
+ input.close();\r
+ return sequences;\r
+ }\r
+\r
+ /**\r
+ * Reader for JRonn horizontal file format\r
+ * \r
+ * >Foobar\r
+ * \r
+ * M G D T T A G\r
+ * \r
+ * 0.48 0.42 0.42 0.48 0.52 0.53 0.54\r
+ * \r
+ * All values are tab delimited\r
+ * \r
+ * @param inStream\r
+ * @return\r
+ * @throws IOException\r
+ * @throws UnknownFileFormatException\r
+ */\r
+ public static List<AnnotatedSequence> readJRonn(final InputStream inStream)\r
+ throws IOException, UnknownFileFormatException {\r
+ final List<AnnotatedSequence> seqs = new ArrayList<AnnotatedSequence>();\r
+\r
+ final BufferedReader infasta = new BufferedReader(\r
+ new InputStreamReader(inStream, "UTF8"), 16000);\r
+\r
+ String line;\r
+ String sname = "";\r
+ do {\r
+ line = infasta.readLine();\r
+ if (line == null || line.isEmpty()) {\r
+ // skip empty lines\r
+ continue;\r
+ }\r
+ if (line.startsWith(">")) {\r
+ // read name\r
+ sname = line.trim().substring(1);\r
+ // read sequence line\r
+ line = infasta.readLine();\r
+ final String sequence = line.replace("\t", "");\r
+ // read annotation line\r
+ line = infasta.readLine();\r
+ String[] annotValues = line.split("\t");\r
+ float[] annotation = convertToNumber(annotValues);\r
+ if (annotation.length != sequence.length()) {\r
+ throw new UnknownFileFormatException(\r
+ "File does not look like Jronn horizontally formatted output file!\n"\r
+ + JRONN_WRONG_FORMAT_MESSAGE);\r
+ }\r
+ seqs.add(new AnnotatedSequence(sname, sequence, annotation));\r
+ }\r
+ } while (line != null);\r
+\r
+ infasta.close();\r
+ return seqs;\r
+ }\r
+\r
+ private static float[] convertToNumber(String[] annotValues)\r
+ throws UnknownFileFormatException {\r
+ float[] annotation = new float[annotValues.length];\r
+ try {\r
+ for (int i = 0; i < annotation.length; i++) {\r
+ annotation[i] = Float.parseFloat(annotValues[i]);\r
+ }\r
+ } catch (NumberFormatException e) {\r
+ throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE, e\r
+ .getCause());\r
+ }\r
+ return annotation;\r
+ }\r
+\r
+ private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
+ + ">sequence_name\n "\r
+ + "M V S\n"\r
+ + "0.43 0.22 0.65\n"\r
+ + "Where first line is the sequence name,\n"\r
+ + "second line is the tab delimited sequence,\n"\r
+ + "third line contains tab delimited disorder prediction values.\n"\r
+ + "No lines are allowed between these three. Additionally, the number of "\r
+ + "sequence residues must be equal to the number of the disorder values.";\r
+\r
/**\r
* Closes the Closeable and logs the exception, if any
* \r