Changes from JWS2 branch merged, mostly javadoc
[jabaws.git] / datamodel / compbio / data / sequence / ClustalAlignmentUtil.java
1 /* Copyright (c) 2009 Peter Troshin\r
2  *  \r
3  *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 1.0 \r
4  * \r
5  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
6  *  Apache License version 2 as published by the Apache Software Foundation\r
7  * \r
8  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
9  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
10  *  License for more details.\r
11  * \r
12  *  A copy of the license is in apache_license.txt. It is also available here:\r
13  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
14  * \r
15  * Any republication or derived work distributed in source code form\r
16  * must include this copyright and license notice.\r
17  */\r
18 \r
19 package compbio.data.sequence;\r
20 \r
21 import java.io.BufferedReader;\r
22 import java.io.File;\r
23 import java.io.FileInputStream;\r
24 import java.io.IOException;\r
25 import java.io.InputStream;\r
26 import java.io.InputStreamReader;\r
27 import java.io.OutputStream;\r
28 import java.io.OutputStreamWriter;\r
29 import java.io.PrintWriter;\r
30 import java.util.ArrayList;\r
31 import java.util.Arrays;\r
32 import java.util.HashMap;\r
33 import java.util.List;\r
34 import java.util.Map;\r
35 import java.util.StringTokenizer;\r
36 import java.util.logging.Logger;\r
37 \r
38 /**\r
39  * Tools to read and write clustal formated files\r
40  * \r
41  * @author Petr Troshin based on jimp class\r
42  * \r
43  * @version 1.0 September 2009\r
44  * \r
45  */\r
46 public final class ClustalAlignmentUtil {\r
47 \r
48         private static final Logger log = Logger\r
49                         .getLogger(ClustalAlignmentUtil.class.getCanonicalName());\r
50 \r
51         /**\r
52          * Dash char to be used as gap char in the alignments\r
53          */\r
54         public static final char gapchar = '-';\r
55 \r
56         /*\r
57          * Number of spaces separating the name and the sequence\r
58          */\r
59         private static final String spacer = "      "; // 6 space characters\r
60         /*\r
61          * name length limit is 30 characters! 2.0.7 - 2.0.12 clustalw /* if name is\r
62          * longer than that it gets trimmed in the end\r
63          */\r
64         private static final int maxNameLength = 30; // Maximum name length\r
65         /*\r
66          * If all sequences names in the alignment is shorter than\r
67          * minNameHolderLength than spaces are added to complete the name up to\r
68          * minNameHolderLength\r
69          */\r
70         private static final int minNameHolderLength = 10; // Minimum number of\r
71 \r
72         // TODO check whether clustal still loads data if length is 60!\r
73         private static final int oneLineAlignmentLength = 60; // this could in fact\r
74 \r
75         // be 50\r
76 \r
77         // for long names ~30 chars\r
78 \r
79         /**\r
80          * Read Clustal formatted alignment. Limitations: Does not read consensus\r
81          * \r
82          * Sequence names as well as the sequences are not guaranteed to be unique!\r
83          * \r
84          * @throws {@link IOException}\r
85          * @throws {@link UnknownFileFormatException}\r
86          */\r
87         public static Alignment readClustalFile(InputStream instream)\r
88                         throws IOException, UnknownFileFormatException {\r
89 \r
90                 boolean flag = false;\r
91 \r
92                 List<String> headers = new ArrayList<String>();\r
93                 Map<String, StringBuffer> seqhash = new HashMap<String, StringBuffer>();\r
94                 FastaSequence[] seqs = null;\r
95 \r
96                 String line;\r
97 \r
98                 BufferedReader breader = new BufferedReader(new InputStreamReader(\r
99                                 instream));\r
100                 while ((line = breader.readLine()) != null) {\r
101                         if (line.indexOf(" ") != 0) {\r
102                                 java.util.StringTokenizer str = new StringTokenizer(line, " ");\r
103                                 String id = "";\r
104 \r
105                                 if (str.hasMoreTokens()) {\r
106                                         id = str.nextToken();\r
107                                         // PROBCONS output clustal formatted file with not mention\r
108                                         // of CLUSTAL (:-))\r
109                                         if (id.equals("CLUSTAL") || id.equals("PROBCONS")) {\r
110                                                 flag = true;\r
111                                         } else {\r
112                                                 if (flag) {\r
113                                                         StringBuffer tempseq;\r
114                                                         if (seqhash.containsKey(id)) {\r
115                                                                 tempseq = seqhash.get(id);\r
116                                                         } else {\r
117                                                                 tempseq = new StringBuffer();\r
118                                                                 seqhash.put(id, tempseq);\r
119                                                         }\r
120 \r
121                                                         if (!(headers.contains(id))) {\r
122                                                                 headers.add(id);\r
123                                                         }\r
124 \r
125                                                         tempseq.append(str.nextToken());\r
126                                                 }\r
127                                         }\r
128                                 }\r
129                         }\r
130                 }\r
131                 breader.close();\r
132 \r
133                 // TODO improve this bit\r
134                 if (flag) {\r
135 \r
136                         // Add sequences to the hash\r
137                         seqs = new FastaSequence[headers.size()];\r
138                         for (int i = 0; i < headers.size(); i++) {\r
139                                 if (seqhash.get(headers.get(i)) != null) {\r
140 \r
141                                         FastaSequence newSeq = new FastaSequence(headers.get(i),\r
142                                                         seqhash.get(headers.get(i)).toString());\r
143 \r
144                                         seqs[i] = newSeq;\r
145 \r
146                                 } else {\r
147                                         // should not happened\r
148                                         throw new AssertionError(\r
149                                                         "Bizarreness! Can't find sequence for "\r
150                                                                         + headers.get(i));\r
151                                 }\r
152                         }\r
153                 }\r
154                 if (seqs == null || seqs.length == 0) {\r
155                         throw new UnknownFileFormatException(\r
156                                         "Input does not appear to be a clustal file! ");\r
157                 }\r
158                 return new Alignment(Arrays.asList(seqs), new AlignmentMetadata(\r
159                                 Program.CLUSTAL, gapchar));\r
160         }\r
161 \r
162         /**\r
163          * Please note this method closes the input stream provided as a parameter\r
164          * \r
165          * @param input\r
166          * @return true if the file is recognised as Clustal formatted alignment,\r
167          *         false otherwise\r
168          */\r
169         public static boolean isValidClustalFile(InputStream input) {\r
170                 if (input == null) {\r
171                         throw new NullPointerException("Input is expected!");\r
172                 }\r
173                 BufferedReader breader = new BufferedReader(\r
174                                 new InputStreamReader(input));\r
175                 try {\r
176                         if (input.available() < 10) {\r
177                                 return false;\r
178                         }\r
179                         // read first 10 lines to find "Clustal"\r
180                         for (int i = 0; i < 10; i++) {\r
181                                 String line = breader.readLine();\r
182                                 if (line != null) {\r
183                                         line = line.toUpperCase().trim();\r
184                                         if (line.contains("CLUSTAL") || line.contains("PROBCONS")) {\r
185                                                 return true;\r
186                                         }\r
187                                 }\r
188                         }\r
189 \r
190                         breader.close();\r
191                 } catch (IOException e) {\r
192                         log.severe("Could not read from the stream! "\r
193                                         + e.getLocalizedMessage() + e.getCause());\r
194                 } finally {\r
195                         SequenceUtil.closeSilently(log, breader);\r
196                 }\r
197                 return false;\r
198         }\r
199 \r
200         /**\r
201          * Write Clustal formatted alignment Limitations: does not record the\r
202          * consensus. Potential bug - records 60 chars length alignment where\r
203          * Clustal would have recorded 50 chars.\r
204          * \r
205          * @param outStream\r
206          * \r
207          * @param alignment\r
208          * @throws IOException\r
209          */\r
210         public static void writeClustalAlignment(final OutputStream outStream,\r
211                         final Alignment alignment) throws IOException {\r
212                 List<FastaSequence> seqs = alignment.getSequences();\r
213 \r
214                 PrintWriter out = new PrintWriter(new OutputStreamWriter(outStream));\r
215 \r
216                 out.write("CLUSTAL\n\n\n");\r
217 \r
218                 int max = 0;\r
219                 int maxidLength = 0;\r
220 \r
221                 int i = 0;\r
222                 // Find the longest sequence name\r
223                 for (FastaSequence fs : seqs) {\r
224                         String tmp = fs.getId();\r
225 \r
226                         if (fs.getSequence().length() > max) {\r
227                                 max = fs.getSequence().length();\r
228                         }\r
229                         if (tmp.length() > maxidLength) {\r
230                                 maxidLength = tmp.length();\r
231                         }\r
232                         i++;\r
233                 }\r
234                 if (maxidLength < minNameHolderLength) {\r
235                         maxidLength = minNameHolderLength;\r
236                 }\r
237                 if (maxidLength > maxNameLength) {\r
238                         maxidLength = 30; // the rest will be trimmed\r
239                 }\r
240 \r
241                 int oneLineAlignmentLength = 60;\r
242                 int nochunks = max / oneLineAlignmentLength + 1;\r
243 \r
244                 for (i = 0; i < nochunks; i++) {\r
245                         int j = 0;\r
246                         for (FastaSequence fs : seqs) {\r
247 \r
248                                 String name = fs.getId();\r
249                                 // display at most 30 characters in the name, keep the names\r
250                                 // 6 spaces away from the alignment for longest sequence names,\r
251                                 // and more than this for shorter names\r
252                                 out.format(\r
253                                                 "%-" + maxidLength + "s" + spacer,\r
254                                                 (name.length() > maxNameLength ? name.substring(0,\r
255                                                                 maxidLength) : name));\r
256                                 int start = i * oneLineAlignmentLength;\r
257                                 int end = start + oneLineAlignmentLength;\r
258 \r
259                                 if (end < fs.getSequence().length()\r
260                                                 && start < fs.getSequence().length()) {\r
261                                         out.write(fs.getSequence().substring(start, end) + "\n");\r
262                                 } else {\r
263                                         if (start < fs.getSequence().length()) {\r
264                                                 out.write(fs.getSequence().substring(start) + "\n");\r
265                                         }\r
266                                 }\r
267                                 j++;\r
268                         }\r
269                         out.write("\n");\r
270                 }\r
271                 try {\r
272                         out.close();\r
273                 } finally {\r
274                         SequenceUtil.closeSilently(log, out);\r
275                 }\r
276         }\r
277 \r
278         public static Alignment readClustalFile(File file)\r
279                         throws UnknownFileFormatException, IOException {\r
280                 if (file == null) {\r
281                         throw new NullPointerException("File is expected!");\r
282                 }\r
283                 FileInputStream fio = new FileInputStream(file);\r
284                 Alignment seqAl = ClustalAlignmentUtil.readClustalFile(fio);\r
285                 try {\r
286                         fio.close();\r
287                 } finally {\r
288                         SequenceUtil.closeSilently(log, fio);\r
289                 }\r
290                 return seqAl;\r
291         }\r
292 }\r