1 /* Copyright (c) 2009 Peter Troshin
\r
3 * JAva Bioinformatics Analysis Web Services (JABAWS) @version: 1.0
\r
5 * This library is free software; you can redistribute it and/or modify it under the terms of the
\r
6 * Apache License version 2 as published by the Apache Software Foundation
\r
8 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
\r
9 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache
\r
10 * License for more details.
\r
12 * A copy of the license is in apache_license.txt. It is also available here:
\r
13 * @see: http://www.apache.org/licenses/LICENSE-2.0.txt
\r
15 * Any republication or derived work distributed in source code form
\r
16 * must include this copyright and license notice.
\r
19 package compbio.data.sequence;
\r
21 import java.io.BufferedReader;
\r
22 import java.io.File;
\r
23 import java.io.FileInputStream;
\r
24 import java.io.IOException;
\r
25 import java.io.InputStream;
\r
26 import java.io.InputStreamReader;
\r
27 import java.io.Writer;
\r
28 import java.util.ArrayList;
\r
29 import java.util.Arrays;
\r
30 import java.util.HashMap;
\r
31 import java.util.List;
\r
32 import java.util.Map;
\r
33 import java.util.StringTokenizer;
\r
34 import java.util.logging.Logger;
\r
37 * Tools to read and write Clustal formatted files
\r
39 * @author Petr Troshin based on jimp class
\r
41 * @version 1.0 September 2009
\r
44 public final class ClustalAlignmentUtil {
\r
46 private static final Logger log = Logger
\r
47 .getLogger(ClustalAlignmentUtil.class.getCanonicalName());
\r
50 * Dash char to be used as gap char in the alignments
\r
52 public static final char gapchar = '-';
\r
55 * Number of spaces separating the name and the sequence
\r
57 private static final String spacer = " "; // 6 space characters
\r
59 * name length limit is 30 characters! 2.0.7 - 2.0.12 clustalw /* if name is
\r
60 * longer than that it gets trimmed in the end
\r
62 private static final int maxNameLength = 30; // Maximum name length
\r
64 * If all sequences names in the alignment is shorter than
\r
65 * minNameHolderLength than spaces are added to complete the name up to
\r
66 * minNameHolderLength
\r
68 private static final int minNameHolderLength = 10; // Minimum number of
\r
70 // TODO check whether clustal still loads data if length is 60!
\r
71 private static final int oneLineAlignmentLength = 60; // this could in fact
\r
75 // for long names ~30 chars
\r
78 * Read Clustal formatted alignment. Limitations: Does not read consensus
\r
80 * Sequence names as well as the sequences are not guaranteed to be unique!
\r
82 * @throws {@link IOException}
\r
83 * @throws {@link UnknownFileFormatException}
\r
85 public static Alignment readClustalFile(InputStream instream)
\r
86 throws IOException, UnknownFileFormatException {
\r
88 boolean flag = false;
\r
90 List<String> headers = new ArrayList<String>();
\r
91 Map<String, StringBuffer> seqhash = new HashMap<String, StringBuffer>();
\r
92 FastaSequence[] seqs = null;
\r
96 BufferedReader breader = new BufferedReader(new InputStreamReader(
\r
98 while ((line = breader.readLine()) != null) {
\r
99 if (line.indexOf(" ") != 0) {
\r
100 java.util.StringTokenizer str = new StringTokenizer(line, " ");
\r
103 if (str.hasMoreTokens()) {
\r
104 id = str.nextToken();
\r
105 // PROBCONS output clustal formatted file with not mention
\r
106 // of CLUSTAL (:-))
\r
107 if (id.equals("CLUSTAL") || id.equals("PROBCONS")) {
\r
111 StringBuffer tempseq;
\r
112 if (seqhash.containsKey(id)) {
\r
113 tempseq = seqhash.get(id);
\r
115 tempseq = new StringBuffer();
\r
116 seqhash.put(id, tempseq);
\r
119 if (!(headers.contains(id))) {
\r
123 tempseq.append(str.nextToken());
\r
131 // TODO improve this bit
\r
134 // Add sequences to the hash
\r
135 seqs = new FastaSequence[headers.size()];
\r
136 for (int i = 0; i < headers.size(); i++) {
\r
137 if (seqhash.get(headers.get(i)) != null) {
\r
139 FastaSequence newSeq = new FastaSequence(headers.get(i),
\r
140 seqhash.get(headers.get(i)).toString());
\r
145 // should not happened
\r
146 throw new AssertionError(
\r
147 "Bizarreness! Can't find sequence for "
\r
152 if (seqs == null || seqs.length == 0) {
\r
153 throw new UnknownFileFormatException(
\r
154 "Input does not appear to be a clustal file! ");
\r
156 return new Alignment(Arrays.asList(seqs), new AlignmentMetadata(
\r
157 Program.CLUSTAL, gapchar));
\r
161 * Please note this method closes the input stream provided as a parameter
\r
164 * @return true if the file is recognised as Clustal formatted alignment,
\r
167 public static boolean isValidClustalFile(InputStream input) {
\r
168 if (input == null) {
\r
169 throw new NullPointerException("Input is expected!");
\r
171 BufferedReader breader = new BufferedReader(
\r
172 new InputStreamReader(input));
\r
174 if (input.available() < 10) {
\r
177 // read first 10 lines to find "Clustal"
\r
178 for (int i = 0; i < 10; i++) {
\r
179 String line = breader.readLine();
\r
180 if (line != null) {
\r
181 line = line.toUpperCase().trim();
\r
182 if (line.contains("CLUSTAL") || line.contains("PROBCONS")) {
\r
189 } catch (IOException e) {
\r
190 log.severe("Could not read from the stream! "
\r
191 + e.getLocalizedMessage() + e.getCause());
\r
193 SequenceUtil.closeSilently(log, breader);
\r
199 * Write Clustal formatted alignment Limitations: does not record the
\r
200 * consensus. Potential bug - records 60 chars length alignment where
\r
201 * Clustal would have recorded 50 chars.
\r
206 * @throws IOException
\r
208 public static void writeClustalAlignment(final Writer out,
\r
209 final Alignment alignment) throws IOException {
\r
210 List<FastaSequence> seqs = alignment.getSequences();
\r
212 out.write("CLUSTAL\n\n\n");
\r
215 int maxidLength = 0;
\r
218 // Find the longest sequence name
\r
219 for (FastaSequence fs : seqs) {
\r
220 String tmp = fs.getId();
\r
222 if (fs.getSequence().length() > max) {
\r
223 max = fs.getSequence().length();
\r
225 if (tmp.length() > maxidLength) {
\r
226 maxidLength = tmp.length();
\r
230 if (maxidLength < minNameHolderLength) {
\r
231 maxidLength = minNameHolderLength;
\r
233 if (maxidLength > maxNameLength) {
\r
234 maxidLength = 30; // the rest will be trimmed
\r
237 int oneLineAlignmentLength = 60;
\r
238 int nochunks = max / oneLineAlignmentLength + 1;
\r
240 for (i = 0; i < nochunks; i++) {
\r
242 for (FastaSequence fs : seqs) {
\r
244 String name = fs.getId();
\r
245 // display at most 30 characters in the name, keep the names
\r
246 // 6 spaces away from the alignment for longest sequence names,
\r
247 // and more than this for shorter names
\r
248 out.write(String.format(
\r
249 "%-" + maxidLength + "s" + spacer,
\r
250 (name.length() > maxNameLength ? name.substring(0,
\r
251 maxidLength) : name)));
\r
252 int start = i * oneLineAlignmentLength;
\r
253 int end = start + oneLineAlignmentLength;
\r
255 if (end < fs.getSequence().length()
\r
256 && start < fs.getSequence().length()) {
\r
257 out.write(fs.getSequence().substring(start, end) + "\n");
\r
259 if (start < fs.getSequence().length()) {
\r
260 out.write(fs.getSequence().substring(start) + "\n");
\r
270 SequenceUtil.closeSilently(log, out);
\r
274 public static Alignment readClustalFile(File file)
\r
275 throws UnknownFileFormatException, IOException {
\r
276 if (file == null) {
\r
277 throw new NullPointerException("File is expected!");
\r
279 FileInputStream fio = new FileInputStream(file);
\r
280 Alignment seqAl = ClustalAlignmentUtil.readClustalFile(fio);
\r
284 SequenceUtil.closeSilently(log, fio);
\r