2656046ee8c1381e4164bc7cb108544cc6b16506
[jabaws.git] / datamodel / compbio / data / sequence / ClustalAlignmentUtil.java
1 /* Copyright (c) 2009 Peter Troshin\r
2  *  \r
3  *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 1.0 \r
4  * \r
5  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
6  *  Apache License version 2 as published by the Apache Software Foundation\r
7  * \r
8  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
9  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
10  *  License for more details.\r
11  * \r
12  *  A copy of the license is in apache_license.txt. It is also available here:\r
13  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
14  * \r
15  * Any republication or derived work distributed in source code form\r
16  * must include this copyright and license notice.\r
17  */\r
18 \r
19 package compbio.data.sequence;\r
20 \r
21 import java.io.BufferedReader;\r
22 import java.io.File;\r
23 import java.io.FileInputStream;\r
24 import java.io.IOException;\r
25 import java.io.InputStream;\r
26 import java.io.InputStreamReader;\r
27 import java.io.Writer;\r
28 import java.util.ArrayList;\r
29 import java.util.Arrays;\r
30 import java.util.HashMap;\r
31 import java.util.List;\r
32 import java.util.Map;\r
33 import java.util.StringTokenizer;\r
34 import java.util.logging.Logger;\r
35 \r
36 /**\r
37  * Tools to read and write clustal formated files\r
38  * \r
39  * @author Petr Troshin based on jimp class\r
40  * \r
41  * @version 1.0 September 2009\r
42  * \r
43  */\r
44 public final class ClustalAlignmentUtil {\r
45 \r
46         private static final Logger log = Logger\r
47                         .getLogger(ClustalAlignmentUtil.class.getCanonicalName());\r
48 \r
49         /**\r
50          * Dash char to be used as gap char in the alignments\r
51          */\r
52         public static final char gapchar = '-';\r
53 \r
54         /*\r
55          * Number of spaces separating the name and the sequence\r
56          */\r
57         private static final String spacer = "      "; // 6 space characters\r
58         /*\r
59          * name length limit is 30 characters! 2.0.7 - 2.0.12 clustalw /* if name is\r
60          * longer than that it gets trimmed in the end\r
61          */\r
62         private static final int maxNameLength = 30; // Maximum name length\r
63         /*\r
64          * If all sequences names in the alignment is shorter than\r
65          * minNameHolderLength than spaces are added to complete the name up to\r
66          * minNameHolderLength\r
67          */\r
68         private static final int minNameHolderLength = 10; // Minimum number of\r
69 \r
70         // TODO check whether clustal still loads data if length is 60!\r
71         private static final int oneLineAlignmentLength = 60; // this could in fact\r
72 \r
73         // be 50\r
74 \r
75         // for long names ~30 chars\r
76 \r
77         /**\r
78          * Read Clustal formatted alignment. Limitations: Does not read consensus\r
79          * \r
80          * Sequence names as well as the sequences are not guaranteed to be unique!\r
81          * \r
82          * @throws {@link IOException}\r
83          * @throws {@link UnknownFileFormatException}\r
84          */\r
85         public static Alignment readClustalFile(InputStream instream)\r
86                         throws IOException, UnknownFileFormatException {\r
87 \r
88                 boolean flag = false;\r
89 \r
90                 List<String> headers = new ArrayList<String>();\r
91                 Map<String, StringBuffer> seqhash = new HashMap<String, StringBuffer>();\r
92                 FastaSequence[] seqs = null;\r
93 \r
94                 String line;\r
95 \r
96                 BufferedReader breader = new BufferedReader(new InputStreamReader(\r
97                                 instream));\r
98                 while ((line = breader.readLine()) != null) {\r
99                         if (line.indexOf(" ") != 0) {\r
100                                 java.util.StringTokenizer str = new StringTokenizer(line, " ");\r
101                                 String id = "";\r
102 \r
103                                 if (str.hasMoreTokens()) {\r
104                                         id = str.nextToken();\r
105                                         // PROBCONS output clustal formatted file with not mention\r
106                                         // of CLUSTAL (:-))\r
107                                         if (id.equals("CLUSTAL") || id.equals("PROBCONS")) {\r
108                                                 flag = true;\r
109                                         } else {\r
110                                                 if (flag) {\r
111                                                         StringBuffer tempseq;\r
112                                                         if (seqhash.containsKey(id)) {\r
113                                                                 tempseq = seqhash.get(id);\r
114                                                         } else {\r
115                                                                 tempseq = new StringBuffer();\r
116                                                                 seqhash.put(id, tempseq);\r
117                                                         }\r
118 \r
119                                                         if (!(headers.contains(id))) {\r
120                                                                 headers.add(id);\r
121                                                         }\r
122 \r
123                                                         tempseq.append(str.nextToken());\r
124                                                 }\r
125                                         }\r
126                                 }\r
127                         }\r
128                 }\r
129                 breader.close();\r
130 \r
131                 // TODO improve this bit\r
132                 if (flag) {\r
133 \r
134                         // Add sequences to the hash\r
135                         seqs = new FastaSequence[headers.size()];\r
136                         for (int i = 0; i < headers.size(); i++) {\r
137                                 if (seqhash.get(headers.get(i)) != null) {\r
138 \r
139                                         FastaSequence newSeq = new FastaSequence(headers.get(i),\r
140                                                         seqhash.get(headers.get(i)).toString());\r
141 \r
142                                         seqs[i] = newSeq;\r
143 \r
144                                 } else {\r
145                                         // should not happened\r
146                                         throw new AssertionError(\r
147                                                         "Bizarreness! Can't find sequence for "\r
148                                                                         + headers.get(i));\r
149                                 }\r
150                         }\r
151                 }\r
152                 if (seqs == null || seqs.length == 0) {\r
153                         throw new UnknownFileFormatException(\r
154                                         "Input does not appear to be a clustal file! ");\r
155                 }\r
156                 return new Alignment(Arrays.asList(seqs), new AlignmentMetadata(\r
157                                 Program.CLUSTAL, gapchar));\r
158         }\r
159 \r
160         /**\r
161          * Please note this method closes the input stream provided as a parameter\r
162          * \r
163          * @param input\r
164          * @return true if the file is recognised as Clustal formatted alignment,\r
165          *         false otherwise\r
166          */\r
167         public static boolean isValidClustalFile(InputStream input) {\r
168                 if (input == null) {\r
169                         throw new NullPointerException("Input is expected!");\r
170                 }\r
171                 BufferedReader breader = new BufferedReader(\r
172                                 new InputStreamReader(input));\r
173                 try {\r
174                         if (input.available() < 10) {\r
175                                 return false;\r
176                         }\r
177                         // read first 10 lines to find "Clustal"\r
178                         for (int i = 0; i < 10; i++) {\r
179                                 String line = breader.readLine();\r
180                                 if (line != null) {\r
181                                         line = line.toUpperCase().trim();\r
182                                         if (line.contains("CLUSTAL") || line.contains("PROBCONS")) {\r
183                                                 return true;\r
184                                         }\r
185                                 }\r
186                         }\r
187 \r
188                         breader.close();\r
189                 } catch (IOException e) {\r
190                         log.severe("Could not read from the stream! "\r
191                                         + e.getLocalizedMessage() + e.getCause());\r
192                 } finally {\r
193                         SequenceUtil.closeSilently(log, breader);\r
194                 }\r
195                 return false;\r
196         }\r
197 \r
198         /**\r
199          * Write Clustal formatted alignment Limitations: does not record the\r
200          * consensus. Potential bug - records 60 chars length alignment where\r
201          * Clustal would have recorded 50 chars.\r
202          * \r
203          * @param outStream\r
204          * \r
205          * @param alignment\r
206          * @throws IOException\r
207          */\r
208         public static void writeClustalAlignment(final Writer out,\r
209                         final Alignment alignment) throws IOException {\r
210                 List<FastaSequence> seqs = alignment.getSequences();\r
211 \r
212                 out.write("CLUSTAL\n\n\n");\r
213 \r
214                 int max = 0;\r
215                 int maxidLength = 0;\r
216 \r
217                 int i = 0;\r
218                 // Find the longest sequence name\r
219                 for (FastaSequence fs : seqs) {\r
220                         String tmp = fs.getId();\r
221 \r
222                         if (fs.getSequence().length() > max) {\r
223                                 max = fs.getSequence().length();\r
224                         }\r
225                         if (tmp.length() > maxidLength) {\r
226                                 maxidLength = tmp.length();\r
227                         }\r
228                         i++;\r
229                 }\r
230                 if (maxidLength < minNameHolderLength) {\r
231                         maxidLength = minNameHolderLength;\r
232                 }\r
233                 if (maxidLength > maxNameLength) {\r
234                         maxidLength = 30; // the rest will be trimmed\r
235                 }\r
236 \r
237                 int oneLineAlignmentLength = 60;\r
238                 int nochunks = max / oneLineAlignmentLength + 1;\r
239 \r
240                 for (i = 0; i < nochunks; i++) {\r
241                         int j = 0;\r
242                         for (FastaSequence fs : seqs) {\r
243 \r
244                                 String name = fs.getId();\r
245                                 // display at most 30 characters in the name, keep the names\r
246                                 // 6 spaces away from the alignment for longest sequence names,\r
247                                 // and more than this for shorter names\r
248                                 out.write(String.format(\r
249                                                 "%-" + maxidLength + "s" + spacer,\r
250                                                 (name.length() > maxNameLength ? name.substring(0,\r
251                                                                 maxidLength) : name)));\r
252                                 int start = i * oneLineAlignmentLength;\r
253                                 int end = start + oneLineAlignmentLength;\r
254 \r
255                                 if (end < fs.getSequence().length()\r
256                                                 && start < fs.getSequence().length()) {\r
257                                         out.write(fs.getSequence().substring(start, end) + "\n");\r
258                                 } else {\r
259                                         if (start < fs.getSequence().length()) {\r
260                                                 out.write(fs.getSequence().substring(start) + "\n");\r
261                                         }\r
262                                 }\r
263                                 j++;\r
264                         }\r
265                         out.write("\n");\r
266                 }\r
267                 try {\r
268                         out.close();\r
269                 } finally {\r
270                         SequenceUtil.closeSilently(log, out);\r
271                 }\r
272         }\r
273 \r
274         public static Alignment readClustalFile(File file)\r
275                         throws UnknownFileFormatException, IOException {\r
276                 if (file == null) {\r
277                         throw new NullPointerException("File is expected!");\r
278                 }\r
279                 FileInputStream fio = new FileInputStream(file);\r
280                 Alignment seqAl = ClustalAlignmentUtil.readClustalFile(fio);\r
281                 try {\r
282                         fio.close();\r
283                 } finally {\r
284                         SequenceUtil.closeSilently(log, fio);\r
285                 }\r
286                 return seqAl;\r
287         }\r
288 }\r