1 /* Copyright (c) 2009 Peter Troshin
\r
3 * JAva Bioinformatics Analysis Web Services (JABAWS) @version: 1.0
\r
5 * This library is free software; you can redistribute it and/or modify it under the terms of the
\r
6 * Apache License version 2 as published by the Apache Software Foundation
\r
8 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
\r
9 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache
\r
10 * License for more details.
\r
12 * A copy of the license is in apache_license.txt. It is also available here:
\r
13 * @see: http://www.apache.org/licenses/LICENSE-2.0.txt
\r
15 * Any republication or derived work distributed in source code form
\r
16 * must include this copyright and license notice.
\r
19 package compbio.data.sequence;
\r
21 import java.io.BufferedReader;
\r
22 import java.io.File;
\r
23 import java.io.FileInputStream;
\r
24 import java.io.IOException;
\r
25 import java.io.InputStream;
\r
26 import java.io.InputStreamReader;
\r
27 import java.io.OutputStream;
\r
28 import java.io.OutputStreamWriter;
\r
29 import java.io.PrintWriter;
\r
30 import java.util.ArrayList;
\r
31 import java.util.Arrays;
\r
32 import java.util.HashMap;
\r
33 import java.util.List;
\r
34 import java.util.Map;
\r
35 import java.util.StringTokenizer;
\r
36 import java.util.logging.Logger;
\r
39 * Tools to read and write Clustal formatted files
\r
41 * @author Petr Troshin based on jimp class
\r
43 * @version 1.0 September 2009
\r
46 public final class ClustalAlignmentUtil {
\r
48 private static final Logger log = Logger
\r
49 .getLogger(ClustalAlignmentUtil.class.getCanonicalName());
\r
52 * Dash char to be used as gap char in the alignments
\r
54 public static final char gapchar = '-';
\r
57 * Number of spaces separating the name and the sequence
\r
59 private static final String spacer = " "; // 6 space characters
\r
61 * name length limit is 30 characters! 2.0.7 - 2.0.12 clustalw /* if name is
\r
62 * longer than that it gets trimmed in the end
\r
64 private static final int maxNameLength = 30; // Maximum name length
\r
66 * If all sequences names in the alignment is shorter than
\r
67 * minNameHolderLength than spaces are added to complete the name up to
\r
68 * minNameHolderLength
\r
70 private static final int minNameHolderLength = 10; // Minimum number of
\r
72 // TODO check whether clustal still loads data if length is 60!
\r
73 private static final int oneLineAlignmentLength = 60; // this could in fact
\r
77 // for long names ~30 chars
\r
80 * Read Clustal formatted alignment. Limitations: Does not read consensus
\r
82 * Sequence names as well as the sequences are not guaranteed to be unique!
\r
84 * @throws {@link IOException}
\r
85 * @throws {@link UnknownFileFormatException}
\r
87 public static Alignment readClustalFile(InputStream instream)
\r
88 throws IOException, UnknownFileFormatException {
\r
90 boolean flag = false;
\r
92 List<String> headers = new ArrayList<String>();
\r
93 Map<String, StringBuffer> seqhash = new HashMap<String, StringBuffer>();
\r
94 FastaSequence[] seqs = null;
\r
98 BufferedReader breader = new BufferedReader(new InputStreamReader(
\r
100 while ((line = breader.readLine()) != null) {
\r
101 if (line.indexOf(" ") != 0) {
\r
102 java.util.StringTokenizer str = new StringTokenizer(line, " ");
\r
105 if (str.hasMoreTokens()) {
\r
106 id = str.nextToken();
\r
107 // PROBCONS output clustal formatted file with not mention
\r
108 // of CLUSTAL (:-))
\r
109 if (id.equals("CLUSTAL") || id.equals("PROBCONS")) {
\r
113 StringBuffer tempseq;
\r
114 if (seqhash.containsKey(id)) {
\r
115 tempseq = seqhash.get(id);
\r
117 tempseq = new StringBuffer();
\r
118 seqhash.put(id, tempseq);
\r
121 if (!(headers.contains(id))) {
\r
125 tempseq.append(str.nextToken());
\r
133 // TODO improve this bit
\r
136 // Add sequences to the hash
\r
137 seqs = new FastaSequence[headers.size()];
\r
138 for (int i = 0; i < headers.size(); i++) {
\r
139 if (seqhash.get(headers.get(i)) != null) {
\r
141 FastaSequence newSeq = new FastaSequence(headers.get(i),
\r
142 seqhash.get(headers.get(i)).toString());
\r
147 // should not happened
\r
148 throw new AssertionError(
\r
149 "Bizarreness! Can't find sequence for "
\r
154 if (seqs == null || seqs.length == 0) {
\r
155 throw new UnknownFileFormatException(
\r
156 "Input does not appear to be a clustal file! ");
\r
158 return new Alignment(Arrays.asList(seqs), new AlignmentMetadata(
\r
159 Program.CLUSTAL, gapchar));
\r
163 * Please note this method closes the input stream provided as a parameter
\r
166 * @return true if the file is recognised as Clustal formatted alignment,
\r
169 public static boolean isValidClustalFile(InputStream input) {
\r
170 if (input == null) {
\r
171 throw new NullPointerException("Input is expected!");
\r
173 BufferedReader breader = new BufferedReader(
\r
174 new InputStreamReader(input));
\r
176 if (input.available() < 10) {
\r
179 // read first 10 lines to find "Clustal"
\r
180 for (int i = 0; i < 10; i++) {
\r
181 String line = breader.readLine();
\r
182 if (line != null) {
\r
183 line = line.toUpperCase().trim();
\r
184 if (line.contains("CLUSTAL") || line.contains("PROBCONS")) {
\r
191 } catch (IOException e) {
\r
192 log.severe("Could not read from the stream! "
\r
193 + e.getLocalizedMessage() + e.getCause());
\r
195 SequenceUtil.closeSilently(log, breader);
\r
201 * Write Clustal formatted alignment Limitations: does not record the
\r
202 * consensus. Potential bug - records 60 chars length alignment where
\r
203 * Clustal would have recorded 50 chars.
\r
208 * @throws IOException
\r
210 public static void writeClustalAlignment(final OutputStream outStream,
\r
211 final Alignment alignment) throws IOException {
\r
212 List<FastaSequence> seqs = alignment.getSequences();
\r
214 PrintWriter out = new PrintWriter(new OutputStreamWriter(outStream));
\r
216 out.write("CLUSTAL\n\n\n");
\r
219 int maxidLength = 0;
\r
222 // Find the longest sequence name
\r
223 for (FastaSequence fs : seqs) {
\r
224 String tmp = fs.getId();
\r
226 if (fs.getSequence().length() > max) {
\r
227 max = fs.getSequence().length();
\r
229 if (tmp.length() > maxidLength) {
\r
230 maxidLength = tmp.length();
\r
234 if (maxidLength < minNameHolderLength) {
\r
235 maxidLength = minNameHolderLength;
\r
237 if (maxidLength > maxNameLength) {
\r
238 maxidLength = 30; // the rest will be trimmed
\r
241 int oneLineAlignmentLength = 60;
\r
242 int nochunks = max / oneLineAlignmentLength + 1;
\r
244 for (i = 0; i < nochunks; i++) {
\r
246 for (FastaSequence fs : seqs) {
\r
248 String name = fs.getId();
\r
249 // display at most 30 characters in the name, keep the names
\r
250 // 6 spaces away from the alignment for longest sequence names,
\r
251 // and more than this for shorter names
\r
253 "%-" + maxidLength + "s" + spacer,
\r
254 (name.length() > maxNameLength ? name.substring(0,
\r
255 maxidLength) : name));
\r
256 int start = i * oneLineAlignmentLength;
\r
257 int end = start + oneLineAlignmentLength;
\r
259 if (end < fs.getSequence().length()
\r
260 && start < fs.getSequence().length()) {
\r
261 out.write(fs.getSequence().substring(start, end) + "\n");
\r
263 if (start < fs.getSequence().length()) {
\r
264 out.write(fs.getSequence().substring(start) + "\n");
\r
274 SequenceUtil.closeSilently(log, out);
\r
278 public static Alignment readClustalFile(File file)
\r
279 throws UnknownFileFormatException, IOException {
\r
280 if (file == null) {
\r
281 throw new NullPointerException("File is expected!");
\r
283 FileInputStream fio = new FileInputStream(file);
\r
284 Alignment seqAl = ClustalAlignmentUtil.readClustalFile(fio);
\r
288 SequenceUtil.closeSilently(log, fio);
\r