import java.io.IOException;\r
import java.io.InputStream;\r
import java.io.InputStreamReader;\r
-import java.io.OutputStream;\r
-import java.io.OutputStreamWriter;\r
-import java.io.PrintWriter;\r
+import java.io.Writer;\r
import java.util.ArrayList;\r
import java.util.Arrays;\r
import java.util.HashMap;\r
* \r
* @author Petr Troshin based on jimp class\r
* \r
- * Date September 2009\r
+ * @version 1.0 September 2009\r
* \r
*/\r
public final class ClustalAlignmentUtil {\r
\r
- private static final Logger log = Logger\r
- .getLogger(ClustalAlignmentUtil.class.getCanonicalName());\r
-\r
- /**\r
- * Dash char to be used as gap char in the alignments\r
- */\r
- public static final char gapchar = '-';\r
-\r
- /*\r
- * Number of spaces separating the name and the sequence\r
- */\r
- private static final String spacer = " "; // 6 space characters\r
- /*\r
- * name length limit is 30 characters! 2.0.7 - 2.0.12 clustalw /* if name is\r
- * longer than that it gets trimmed in the end\r
- */\r
- private static final int maxNameLength = 30; // Maximum name length\r
- /*\r
- * If all sequences names in the alignment is shorter than\r
- * minNameHolderLength than spaces are added to complete the name up to\r
- * minNameHolderLength\r
- */\r
- private static final int minNameHolderLength = 10; // Minimum number of\r
-\r
- // TODO check whether clustal still loads data if length is 60!\r
- private static final int oneLineAlignmentLength = 60; // this could in fact\r
-\r
- // be 50\r
-\r
- // for long names ~30 chars\r
-\r
- /**\r
- * Read Clustal formatted alignment. Limitations: Does not read consensus\r
- * \r
- * Sequence names as well as the sequences are not guaranteed to be unique!\r
- * \r
- * @throws {@link IOException}\r
- * @throws {@link UnknownFileFormatException}\r
- */\r
- public static Alignment readClustalFile(InputStream instream)\r
- throws IOException, UnknownFileFormatException {\r
-\r
- boolean flag = false;\r
-\r
- List<String> headers = new ArrayList<String>();\r
- Map<String, StringBuffer> seqhash = new HashMap<String, StringBuffer>();\r
- FastaSequence[] seqs = null;\r
-\r
- String line;\r
-\r
- BufferedReader breader = new BufferedReader(new InputStreamReader(\r
- instream));\r
- while ((line = breader.readLine()) != null) {\r
- if (line.indexOf(" ") != 0) {\r
- java.util.StringTokenizer str = new StringTokenizer(line, " ");\r
- String id = "";\r
-\r
- if (str.hasMoreTokens()) {\r
- id = str.nextToken();\r
- // PROBCONS output clustal formatted file with not mention\r
- // of CLUSTAL (:-))\r
- if (id.equals("CLUSTAL") || id.equals("PROBCONS")) {\r
- flag = true;\r
- } else {\r
- if (flag) {\r
- StringBuffer tempseq;\r
- if (seqhash.containsKey(id)) {\r
- tempseq = seqhash.get(id);\r
- } else {\r
- tempseq = new StringBuffer();\r
- seqhash.put(id, tempseq);\r
- }\r
-\r
- if (!(headers.contains(id))) {\r
- headers.add(id);\r
- }\r
-\r
- tempseq.append(str.nextToken());\r
+ private static final Logger log = Logger\r
+ .getLogger(ClustalAlignmentUtil.class.getCanonicalName());\r
+\r
+ /**\r
+ * Dash character used as the gap character in alignments\r
+ */\r
+ public static final char gapchar = '-';\r
+\r
+ /*\r
+ * Separator written between the padded sequence name and the sequence\r
+ */\r
+ private static final String spacer = " "; // 6 space characters\r
+ /*\r
+ * The name length limit is 30 characters (clustalw 2.0.7 - 2.0.12): a name\r
+ * longer than that is trimmed at the end when the alignment is written\r
+ */\r
+ private static final int maxNameLength = 30; // Maximum name length\r
+ /*\r
+ * If every sequence name in the alignment is shorter than\r
+ * minNameHolderLength, spaces are added to pad the name up to\r
+ * minNameHolderLength\r
+ */\r
+ private static final int minNameHolderLength = 10; // Minimum width of the name column\r
+\r
+ // TODO check whether clustal still loads data if length is 60!\r
+ private static final int oneLineAlignmentLength = 60; // this could in fact\r
+\r
+ // be 50\r
+\r
+ // for long names ~30 chars\r
+\r
+ /**\r
+ * Reads a Clustal formatted alignment from the stream. Lines that start\r
+ * with a space are skipped, which is how the consensus line is left out.\r
+ * Sequence lines are only collected after a line whose first token is\r
+ * exactly CLUSTAL or PROBCONS has been seen.\r
+ * \r
+ * Lines sharing the same first token are concatenated into one sequence,\r
+ * so a repeated name is treated as a continuation of the same sequence.\r
+ * The sequences themselves are not guaranteed to be unique!\r
+ * \r
+ * The reader wrapping the stream is closed only when the loop finishes\r
+ * without an exception.\r
+ * \r
+ * NOTE(review): a sequence line that holds a name and nothing else makes\r
+ * the second nextToken() call throw NoSuchElementException - confirm\r
+ * whether such input can occur.\r
+ * \r
+ * @param instream\r
+ * the stream to read the alignment from\r
+ * @return an alignment built from the sequences in order of first\r
+ * appearance of their names\r
+ * @throws IOException\r
+ * if reading from or closing the stream fails\r
+ * @throws UnknownFileFormatException\r
+ * if no CLUSTAL/PROBCONS header line or no sequence was found\r
+ */\r
+ public static Alignment readClustalFile(InputStream instream)\r
+ throws IOException, UnknownFileFormatException {\r
+\r
+ boolean flag = false;\r
+\r
+ List<String> headers = new ArrayList<String>();\r
+ Map<String, StringBuffer> seqhash = new HashMap<String, StringBuffer>();\r
+ FastaSequence[] seqs = null;\r
+\r
+ String line;\r
+\r
+ BufferedReader breader = new BufferedReader(new InputStreamReader(\r
+ instream));\r
+ while ((line = breader.readLine()) != null) {\r
+ if (line.indexOf(" ") != 0) {\r
+ java.util.StringTokenizer str = new StringTokenizer(line, " ");\r
+ String id = "";\r
+\r
+ if (str.hasMoreTokens()) {\r
+ id = str.nextToken();\r
+ // PROBCONS writes Clustal formatted files that do not mention\r
+ // CLUSTAL, so its name is accepted as a header token as well\r
+ if (id.equals("CLUSTAL") || id.equals("PROBCONS")) {\r
+ flag = true;\r
+ } else {\r
+ if (flag) {\r
+ StringBuffer tempseq;\r
+ if (seqhash.containsKey(id)) {\r
+ tempseq = seqhash.get(id);\r
+ } else {\r
+ tempseq = new StringBuffer();\r
+ seqhash.put(id, tempseq);\r
+ }\r
+\r
+ if (!(headers.contains(id))) {\r
+ headers.add(id);\r
+ }\r
+\r
+ tempseq.append(str.nextToken());\r
+ }\r
+ }\r
+ }\r
}\r
- }\r
}\r
- }\r
- }\r
- breader.close();\r
+ breader.close();\r
\r
- // TODO improve this bit\r
- if (flag) {\r
+ // TODO improve this bit\r
+ if (flag) {\r
\r
- // Add sequences to the hash\r
- seqs = new FastaSequence[headers.size()];\r
- for (int i = 0; i < headers.size(); i++) {\r
- if (seqhash.get(headers.get(i)) != null) {\r
+ // Turn the collected buffers into sequences, in header order\r
+ seqs = new FastaSequence[headers.size()];\r
+ for (int i = 0; i < headers.size(); i++) {\r
+ if (seqhash.get(headers.get(i)) != null) {\r
\r
- FastaSequence newSeq = new FastaSequence(headers.get(i),\r
- seqhash.get(headers.get(i)).toString());\r
+ FastaSequence newSeq = new FastaSequence(headers.get(i),\r
+ seqhash.get(headers.get(i)).toString());\r
\r
- seqs[i] = newSeq;\r
+ seqs[i] = newSeq;\r
\r
- } else {\r
- // should not happened\r
- throw new AssertionError(\r
- "Bizarreness! Can't find sequence for "\r
- + headers.get(i));\r
+ } else {\r
+ // should never happen: every header is added together with its buffer\r
+ throw new AssertionError(\r
+ "Bizarreness! Can't find sequence for "\r
+ + headers.get(i));\r
+ }\r
+ }\r
}\r
- }\r
- }\r
- if (seqs == null || seqs.length == 0) {\r
- throw new UnknownFileFormatException(\r
- "Input does not appear to be a clustal file! ");\r
- }\r
- return new Alignment(Arrays.asList(seqs), new AlignmentMetadata(\r
- Program.CLUSTAL, gapchar));\r
- }\r
-\r
- /**\r
- * \r
- * @param input\r
- * @return true if the file is recognised as Clustal formatted alignment,\r
- * false otherwise\r
- */\r
- public static boolean isValidClustalFile(InputStream input) {\r
- if (input == null) {\r
- throw new NullPointerException("Input is expected!");\r
- }\r
- BufferedReader breader = new BufferedReader(\r
- new InputStreamReader(input));\r
- try {\r
- if (input.available() < 10) {\r
- return false;\r
- }\r
- // read first 10 lines to find "Clustal"\r
- for (int i = 0; i < 10; i++) {\r
- String line = breader.readLine();\r
- if (line != null) {\r
- line = line.toUpperCase().trim();\r
- if (line.contains("CLUSTAL") || line.contains("PROBCONS")) {\r
- return true;\r
- }\r
+ if (seqs == null || seqs.length == 0) {\r
+ throw new UnknownFileFormatException(\r
+ "Input does not appear to be a clustal file! ");\r
}\r
- }\r
-\r
- breader.close();\r
- } catch (IOException e) {\r
- log.severe("Could not read from the stream! "\r
- + e.getLocalizedMessage() + e.getCause());\r
- } finally {\r
- SequenceUtil.closeSilently(log, breader);\r
- }\r
- return false;\r
- }\r
-\r
- /**\r
- * Write Clustal formatted alignment Limitations: does not record the\r
- * consensus. Potential bug - records 60 chars length alignment where\r
- * Clustal would have recorded 50 chars.\r
- * \r
- * @param outStream\r
- * \r
- * @param alignment\r
- * @throws IOException\r
- */\r
- public static void writeClustalAlignment(final OutputStream outStream,\r
- final Alignment alignment) throws IOException {\r
- List<FastaSequence> seqs = alignment.getSequences();\r
-\r
- PrintWriter out = new PrintWriter(new OutputStreamWriter(outStream));\r
-\r
- out.write("CLUSTAL\n\n\n");\r
-\r
- int max = 0;\r
- int maxidLength = 0;\r
-\r
- int i = 0;\r
- // Find the longest sequence name\r
- for (FastaSequence fs : seqs) {\r
- String tmp = fs.getId();\r
-\r
- if (fs.getSequence().length() > max) {\r
- max = fs.getSequence().length();\r
- }\r
- if (tmp.length() > maxidLength) {\r
- maxidLength = tmp.length();\r
- }\r
- i++;\r
- }\r
- if (maxidLength < minNameHolderLength) {\r
- maxidLength = minNameHolderLength;\r
- }\r
- if (maxidLength > maxNameLength) {\r
- maxidLength = 30; // the rest will be trimmed\r
+ return new Alignment(Arrays.asList(seqs), new AlignmentMetadata(\r
+ Program.CLUSTAL, gapchar));\r
}\r
\r
- int oneLineAlignmentLength = 60;\r
- int nochunks = max / oneLineAlignmentLength + 1;\r
-\r
- for (i = 0; i < nochunks; i++) {\r
- int j = 0;\r
- for (FastaSequence fs : seqs) {\r
-\r
- String name = fs.getId();\r
- // display at most 30 characters in the name, keep the names\r
- // 6 spaces away from the alignment for longest sequence names,\r
- // and more than this for shorter names\r
- out.format("%-" + maxidLength + "s" + spacer,\r
- (name.length() > maxNameLength ? name.substring(0,\r
- maxidLength) : name));\r
- int start = i * oneLineAlignmentLength;\r
- int end = start + oneLineAlignmentLength;\r
-\r
- if (end < fs.getSequence().length()\r
- && start < fs.getSequence().length()) {\r
- out.write(fs.getSequence().substring(start, end) + "\n");\r
- } else {\r
- if (start < fs.getSequence().length()) {\r
- out.write(fs.getSequence().substring(start) + "\n");\r
- }\r
+ /**\r
+ * Checks whether the stream looks like a Clustal formatted alignment by\r
+ * searching at most the first 10 lines for the word CLUSTAL or PROBCONS\r
+ * (each line is upper-cased before the comparison).\r
+ * \r
+ * Please note this method closes the input stream provided as a parameter\r
+ * (the reader wrapping it is passed to SequenceUtil.closeSilently in the\r
+ * finally block on every path).\r
+ * \r
+ * Returns false without reading a line when input.available() reports\r
+ * fewer than 10 bytes, and also when reading fails; in the latter case the\r
+ * error is logged rather than rethrown.\r
+ * \r
+ * @param input\r
+ * the stream to inspect, must not be null\r
+ * @return true if the file is recognised as Clustal formatted alignment,\r
+ * false otherwise\r
+ * @throws NullPointerException\r
+ * if input is null\r
+ */\r
+ public static boolean isValidClustalFile(InputStream input) {\r
+ if (input == null) {\r
+ throw new NullPointerException("Input is expected!");\r
}\r
- j++;\r
- }\r
- out.write("\n");\r
- }\r
- try {\r
- out.close();\r
- } finally {\r
- SequenceUtil.closeSilently(log, out);\r
+ BufferedReader breader = new BufferedReader(\r
+ new InputStreamReader(input));\r
+ try {\r
+ if (input.available() < 10) {\r
+ return false;\r
+ }\r
+ // scan at most 10 lines for the CLUSTAL or PROBCONS marker\r
+ for (int i = 0; i < 10; i++) {\r
+ String line = breader.readLine();\r
+ if (line != null) {\r
+ line = line.toUpperCase().trim();\r
+ if (line.contains("CLUSTAL") || line.contains("PROBCONS")) {\r
+ return true;\r
+ }\r
+ }\r
+ }\r
+\r
+ breader.close();\r
+ } catch (IOException e) {\r
+ log.severe("Could not read from the stream! "\r
+ + e.getLocalizedMessage() + e.getCause());\r
+ } finally {\r
+ SequenceUtil.closeSilently(log, breader);\r
+ }\r
+ return false;\r
}\r
- }\r
\r
- public static Alignment readClustalFile(File file)\r
- throws UnknownFileFormatException, IOException {\r
- if (file == null) {\r
- throw new NullPointerException("File is expected!");\r
+    /**\r
+     * Writes the alignment in Clustal format and closes the writer.\r
+     * Limitations: does not record the consensus. Potential bug - records\r
+     * 60 chars length alignment where Clustal would have recorded 50 chars.\r
+     * \r
+     * Names are left-aligned in a column that is at least\r
+     * minNameHolderLength and at most maxNameLength characters wide; longer\r
+     * names are trimmed. A sequence that ends before the current chunk\r
+     * starts is left out of that chunk, so no line ever consists of a name\r
+     * without residues, and no empty trailing chunk is written.\r
+     * \r
+     * @param out\r
+     *            the writer to write to; it is closed before this method\r
+     *            returns, also when writing fails\r
+     * @param alignment\r
+     *            the alignment to write\r
+     * @throws IOException\r
+     *             if writing to or closing the writer fails\r
+     */\r
+    public static void writeClustalAlignment(final Writer out,\r
+            final Alignment alignment) throws IOException {\r
+        List<FastaSequence> seqs = alignment.getSequences();\r
+\r
+        try {\r
+            out.write("CLUSTAL\n\n\n");\r
+\r
+            // Find the longest sequence and the longest sequence name\r
+            int maxSeqLength = 0;\r
+            int nameWidth = minNameHolderLength;\r
+            for (FastaSequence fs : seqs) {\r
+                maxSeqLength = Math.max(maxSeqLength, fs.getSequence().length());\r
+                nameWidth = Math.max(nameWidth, fs.getId().length());\r
+            }\r
+            // the rest of a longer name will be trimmed\r
+            nameWidth = Math.min(nameWidth, maxNameLength);\r
+\r
+            // keep the names one spacer away from the alignment for the\r
+            // longest names, and more than this for shorter names\r
+            final String nameFormat = "%-" + nameWidth + "s" + spacer;\r
+\r
+            // round up, so that a length that is an exact multiple of the\r
+            // line length does not produce an extra chunk without residues\r
+            final int nochunks = (maxSeqLength + oneLineAlignmentLength - 1)\r
+                    / oneLineAlignmentLength;\r
+\r
+            for (int chunk = 0; chunk < nochunks; chunk++) {\r
+                final int start = chunk * oneLineAlignmentLength;\r
+                for (FastaSequence fs : seqs) {\r
+                    final String sequence = fs.getSequence();\r
+                    if (start >= sequence.length()) {\r
+                        // nothing left of this sequence; writing its name\r
+                        // alone would yield a line the reader cannot parse\r
+                        continue;\r
+                    }\r
+                    final String name = fs.getId();\r
+                    out.write(String.format(nameFormat,\r
+                            (name.length() > maxNameLength ? name.substring(0,\r
+                                    nameWidth) : name)));\r
+                    final int end = Math.min(start + oneLineAlignmentLength,\r
+                            sequence.length());\r
+                    out.write(sequence.substring(start, end) + "\n");\r
+                }\r
+                out.write("\n");\r
+            }\r
+            // explicit close keeps a flush/close failure visible to the caller\r
+            out.close();\r
+        } finally {\r
+            // also reached when a write fails, so the writer is not left open\r
+            SequenceUtil.closeSilently(log, out);\r
+        }\r
}\r
- FileInputStream fio = new FileInputStream(file);\r
- Alignment seqAl = ClustalAlignmentUtil.readClustalFile(fio);\r
- try {\r
- fio.close();\r
- } finally {\r
- SequenceUtil.closeSilently(log, fio);\r
+\r
+    /**\r
+     * Reads a Clustal formatted alignment from a file by delegating to\r
+     * {@link #readClustalFile(InputStream)}. The file stream is released on\r
+     * every path, including when parsing fails.\r
+     * \r
+     * @param file\r
+     *            the file to read, must not be null\r
+     * @return the alignment read from the file\r
+     * @throws NullPointerException\r
+     *             if file is null\r
+     * @throws UnknownFileFormatException\r
+     *             if the content is not recognised as a Clustal alignment\r
+     * @throws IOException\r
+     *             if the file cannot be opened, read or closed\r
+     */\r
+    public static Alignment readClustalFile(File file)\r
+            throws UnknownFileFormatException, IOException {\r
+        if (file == null) {\r
+            throw new NullPointerException("File is expected!");\r
+        }\r
+        FileInputStream fio = new FileInputStream(file);\r
+        try {\r
+            Alignment seqAl = ClustalAlignmentUtil.readClustalFile(fio);\r
+            // explicit close keeps a close failure visible to the caller\r
+            fio.close();\r
+            return seqAl;\r
+        } finally {\r
+            // also reached when parsing throws, so the file handle is not leaked\r
+            SequenceUtil.closeSilently(log, fio);\r
+        }\r
}\r
- return seqAl;\r
- }\r
}\r