2 * Copyright (c) 2009 Peter Troshin JAva Bioinformatics Analysis Web Services
\r
3 * (JABAWS) @version: 1.0 This library is free software; you can redistribute it
\r
4 * and/or modify it under the terms of the Apache License version 2 as published
\r
5 * by the Apache Software Foundation This library is distributed in the hope
\r
6 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
\r
7 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
8 * Apache License for more details. A copy of the license is in
\r
9 * apache_license.txt. It is also available here:
\r
10 * @see: http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or
\r
11 * derived work distributed in source code form must include this copyright and
\r
15 package compbio.data.sequence;
\r
17 import java.io.BufferedReader;
\r
18 import java.io.File;
\r
19 import java.io.FileInputStream;
\r
20 import java.io.IOException;
\r
21 import java.io.InputStream;
\r
22 import java.io.InputStreamReader;
\r
23 import java.io.OutputStream;
\r
24 import java.io.OutputStreamWriter;
\r
25 import java.io.PrintWriter;
\r
26 import java.util.ArrayList;
\r
27 import java.util.Arrays;
\r
28 import java.util.HashMap;
\r
29 import java.util.List;
\r
30 import java.util.Map;
\r
31 import java.util.StringTokenizer;
\r
32 import java.util.logging.Logger;
\r
35 * Tools to read and write Clustal formatted files
\r
37 * @author Petr Troshin based on jimp class
\r
39 * Date September 2009
\r
42 public final class ClustalAlignmentUtil {
\r
44 private static final Logger log = Logger
\r
45 .getLogger(ClustalAlignmentUtil.class.getCanonicalName());
\r
48 * Dash char to be used as gap char in the alignments
\r
50 public static final char gapchar = '-';
\r
53 * Number of spaces separating the name and the sequence
\r
55 private static final String spacer = " "; // 6 space characters
\r
57 * name length limit is 30 characters! 2.0.7 - 2.0.12 clustalw /* if name is
\r
58 * longer than that it gets trimmed in the end
\r
60 private static final int maxNameLength = 30; // Maximum name length
\r
62 * If all sequences names in the alignment is shorter than
\r
63 * minNameHolderLength than spaces are added to complete the name up to
\r
64 * minNameHolderLength
\r
66 private static final int minNameHolderLength = 10; // Minimum number of
\r
68 // TODO check whether clustal still loads data if length is 60!
\r
69 private static final int oneLineAlignmentLength = 60; // this could in fact
\r
73 // for long names ~30 chars
\r
76 * Read Clustal formatted alignment. Limitations: Does not read consensus
\r
78 * Sequence names as well as the sequences are not guaranteed to be unique!
\r
80 * @throws {@link IOException}
\r
81 * @throws {@link UnknownFileFormatException}
\r
83 public static Alignment readClustalFile(InputStream instream)
\r
84 throws IOException, UnknownFileFormatException {
\r
86 boolean flag = false;
\r
88 List<String> headers = new ArrayList<String>();
\r
89 Map<String, StringBuffer> seqhash = new HashMap<String, StringBuffer>();
\r
90 FastaSequence[] seqs = null;
\r
94 BufferedReader breader = new BufferedReader(new InputStreamReader(
\r
96 while ((line = breader.readLine()) != null) {
\r
97 if (line.indexOf(" ") != 0) {
\r
98 java.util.StringTokenizer str = new StringTokenizer(line, " ");
\r
101 if (str.hasMoreTokens()) {
\r
102 id = str.nextToken();
\r
103 // PROBCONS output clustal formatted file with not mention
\r
104 // of CLUSTAL (:-))
\r
105 if (id.equals("CLUSTAL") || id.equals("PROBCONS")) {
\r
109 StringBuffer tempseq;
\r
110 if (seqhash.containsKey(id)) {
\r
111 tempseq = seqhash.get(id);
\r
113 tempseq = new StringBuffer();
\r
114 seqhash.put(id, tempseq);
\r
117 if (!(headers.contains(id))) {
\r
121 tempseq.append(str.nextToken());
\r
129 // TODO improve this bit
\r
132 // Add sequences to the hash
\r
133 seqs = new FastaSequence[headers.size()];
\r
134 for (int i = 0; i < headers.size(); i++) {
\r
135 if (seqhash.get(headers.get(i)) != null) {
\r
137 FastaSequence newSeq = new FastaSequence(headers.get(i),
\r
138 seqhash.get(headers.get(i)).toString());
\r
143 // should not happened
\r
144 throw new AssertionError(
\r
145 "Bizarreness! Can't find sequence for "
\r
150 if (seqs == null || seqs.length == 0) {
\r
151 throw new UnknownFileFormatException(
\r
152 "Input does not appear to be a clustal file! ");
\r
154 return new Alignment(Arrays.asList(seqs), new AlignmentMetadata(
\r
155 Program.CLUSTAL, gapchar));
\r
159 * Please note this method closes the input stream provided as a parameter
\r
162 * @return true if the file is recognised as Clustal formatted alignment,
\r
165 public static boolean isValidClustalFile(InputStream input) {
\r
166 if (input == null) {
\r
167 throw new NullPointerException("Input is expected!");
\r
169 BufferedReader breader = new BufferedReader(
\r
170 new InputStreamReader(input));
\r
172 if (input.available() < 10) {
\r
175 // read first 10 lines to find "Clustal"
\r
176 for (int i = 0; i < 10; i++) {
\r
177 String line = breader.readLine();
\r
178 if (line != null) {
\r
179 line = line.toUpperCase().trim();
\r
180 if (line.contains("CLUSTAL") || line.contains("PROBCONS")) {
\r
187 } catch (IOException e) {
\r
188 log.severe("Could not read from the stream! "
\r
189 + e.getLocalizedMessage() + e.getCause());
\r
191 SequenceUtil.closeSilently(log, breader);
\r
197 * Write Clustal formatted alignment Limitations: does not record the
\r
198 * consensus. Potential bug - records 60 chars length alignment where
\r
199 * Clustal would have recorded 50 chars.
\r
204 * @throws IOException
\r
206 public static void writeClustalAlignment(final OutputStream outStream,
\r
207 final Alignment alignment) throws IOException {
\r
208 List<FastaSequence> seqs = alignment.getSequences();
\r
210 PrintWriter out = new PrintWriter(new OutputStreamWriter(outStream));
\r
212 out.write("CLUSTAL\n\n\n");
\r
215 int maxidLength = 0;
\r
218 // Find the longest sequence name
\r
219 for (FastaSequence fs : seqs) {
\r
220 String tmp = fs.getId();
\r
222 if (fs.getSequence().length() > max) {
\r
223 max = fs.getSequence().length();
\r
225 if (tmp.length() > maxidLength) {
\r
226 maxidLength = tmp.length();
\r
230 if (maxidLength < minNameHolderLength) {
\r
231 maxidLength = minNameHolderLength;
\r
233 if (maxidLength > maxNameLength) {
\r
234 maxidLength = 30; // the rest will be trimmed
\r
237 int oneLineAlignmentLength = 60;
\r
238 int nochunks = max / oneLineAlignmentLength + 1;
\r
240 for (i = 0; i < nochunks; i++) {
\r
242 for (FastaSequence fs : seqs) {
\r
244 String name = fs.getId();
\r
245 // display at most 30 characters in the name, keep the names
\r
246 // 6 spaces away from the alignment for longest sequence names,
\r
247 // and more than this for shorter names
\r
248 out.format("%-" + maxidLength + "s" + spacer,
\r
249 (name.length() > maxNameLength ? name.substring(0,
\r
250 maxidLength) : name));
\r
251 int start = i * oneLineAlignmentLength;
\r
252 int end = start + oneLineAlignmentLength;
\r
254 if (end < fs.getSequence().length()
\r
255 && start < fs.getSequence().length()) {
\r
256 out.write(fs.getSequence().substring(start, end) + "\n");
\r
258 if (start < fs.getSequence().length()) {
\r
259 out.write(fs.getSequence().substring(start) + "\n");
\r
269 SequenceUtil.closeSilently(log, out);
\r
273 public static Alignment readClustalFile(File file)
\r
274 throws UnknownFileFormatException, IOException {
\r
275 if (file == null) {
\r
276 throw new NullPointerException("File is expected!");
\r
278 FileInputStream fio = new FileInputStream(file);
\r
279 Alignment seqAl = ClustalAlignmentUtil.readClustalFile(fio);
\r
283 SequenceUtil.closeSilently(log, fio);
\r