Adding AAConWS
[jabaws.git] / datamodel / compbio / data / sequence / ClustalAlignmentUtil.java
1 /*\r
2  * Copyright (c) 2009 Peter Troshin JAva Bioinformatics Analysis Web Services\r
3  * (JABAWS) @version: 1.0 This library is free software; you can redistribute it\r
4  * and/or modify it under the terms of the Apache License version 2 as published\r
5  * by the Apache Software Foundation This library is distributed in the hope\r
6  * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied\r
7  * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\r
8  * Apache License for more details. A copy of the license is in\r
9  * apache_license.txt. It is also available here:\r
10  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or\r
11  * derived work distributed in source code form must include this copyright and\r
12  * license notice.\r
13  */\r
14 \r
15 package compbio.data.sequence;\r
16 \r
17 import java.io.BufferedReader;\r
18 import java.io.File;\r
19 import java.io.FileInputStream;\r
20 import java.io.IOException;\r
21 import java.io.InputStream;\r
22 import java.io.InputStreamReader;\r
23 import java.io.OutputStream;\r
24 import java.io.OutputStreamWriter;\r
25 import java.io.PrintWriter;\r
26 import java.util.ArrayList;\r
27 import java.util.Arrays;\r
28 import java.util.HashMap;\r
29 import java.util.List;\r
30 import java.util.Map;\r
31 import java.util.StringTokenizer;\r
32 import java.util.logging.Logger;\r
33 \r
34 /**\r
35  * Tools to read and write clustal formated files\r
36  * \r
37  * @author Petr Troshin based on jimp class\r
38  * \r
39  *         Date September 2009\r
40  * \r
41  */\r
42 public final class ClustalAlignmentUtil {\r
43 \r
44         private static final Logger log = Logger\r
45                         .getLogger(ClustalAlignmentUtil.class.getCanonicalName());\r
46 \r
47         /**\r
48          * Dash char to be used as gap char in the alignments\r
49          */\r
50         public static final char gapchar = '-';\r
51 \r
52         /*\r
53          * Number of spaces separating the name and the sequence\r
54          */\r
55         private static final String spacer = "      "; // 6 space characters\r
56         /*\r
57          * name length limit is 30 characters! 2.0.7 - 2.0.12 clustalw /* if name is\r
58          * longer than that it gets trimmed in the end\r
59          */\r
60         private static final int maxNameLength = 30; // Maximum name length\r
61         /*\r
62          * If all sequences names in the alignment is shorter than\r
63          * minNameHolderLength than spaces are added to complete the name up to\r
64          * minNameHolderLength\r
65          */\r
66         private static final int minNameHolderLength = 10; // Minimum number of\r
67 \r
68         // TODO check whether clustal still loads data if length is 60!\r
69         private static final int oneLineAlignmentLength = 60; // this could in fact\r
70 \r
71         // be 50\r
72 \r
73         // for long names ~30 chars\r
74 \r
75         /**\r
76          * Read Clustal formatted alignment. Limitations: Does not read consensus\r
77          * \r
78          * Sequence names as well as the sequences are not guaranteed to be unique!\r
79          * \r
80          * @throws {@link IOException}\r
81          * @throws {@link UnknownFileFormatException}\r
82          */\r
83         public static Alignment readClustalFile(InputStream instream)\r
84                         throws IOException, UnknownFileFormatException {\r
85 \r
86                 boolean flag = false;\r
87 \r
88                 List<String> headers = new ArrayList<String>();\r
89                 Map<String, StringBuffer> seqhash = new HashMap<String, StringBuffer>();\r
90                 FastaSequence[] seqs = null;\r
91 \r
92                 String line;\r
93 \r
94                 BufferedReader breader = new BufferedReader(new InputStreamReader(\r
95                                 instream));\r
96                 while ((line = breader.readLine()) != null) {\r
97                         if (line.indexOf(" ") != 0) {\r
98                                 java.util.StringTokenizer str = new StringTokenizer(line, " ");\r
99                                 String id = "";\r
100 \r
101                                 if (str.hasMoreTokens()) {\r
102                                         id = str.nextToken();\r
103                                         // PROBCONS output clustal formatted file with not mention\r
104                                         // of CLUSTAL (:-))\r
105                                         if (id.equals("CLUSTAL") || id.equals("PROBCONS")) {\r
106                                                 flag = true;\r
107                                         } else {\r
108                                                 if (flag) {\r
109                                                         StringBuffer tempseq;\r
110                                                         if (seqhash.containsKey(id)) {\r
111                                                                 tempseq = seqhash.get(id);\r
112                                                         } else {\r
113                                                                 tempseq = new StringBuffer();\r
114                                                                 seqhash.put(id, tempseq);\r
115                                                         }\r
116 \r
117                                                         if (!(headers.contains(id))) {\r
118                                                                 headers.add(id);\r
119                                                         }\r
120 \r
121                                                         tempseq.append(str.nextToken());\r
122                                                 }\r
123                                         }\r
124                                 }\r
125                         }\r
126                 }\r
127                 breader.close();\r
128 \r
129                 // TODO improve this bit\r
130                 if (flag) {\r
131 \r
132                         // Add sequences to the hash\r
133                         seqs = new FastaSequence[headers.size()];\r
134                         for (int i = 0; i < headers.size(); i++) {\r
135                                 if (seqhash.get(headers.get(i)) != null) {\r
136 \r
137                                         FastaSequence newSeq = new FastaSequence(headers.get(i),\r
138                                                         seqhash.get(headers.get(i)).toString());\r
139 \r
140                                         seqs[i] = newSeq;\r
141 \r
142                                 } else {\r
143                                         // should not happened\r
144                                         throw new AssertionError(\r
145                                                         "Bizarreness! Can't find sequence for "\r
146                                                                         + headers.get(i));\r
147                                 }\r
148                         }\r
149                 }\r
150                 if (seqs == null || seqs.length == 0) {\r
151                         throw new UnknownFileFormatException(\r
152                                         "Input does not appear to be a clustal file! ");\r
153                 }\r
154                 return new Alignment(Arrays.asList(seqs), new AlignmentMetadata(\r
155                                 Program.CLUSTAL, gapchar));\r
156         }\r
157 \r
158         /**\r
159          * Please note this method closes the input stream provided as a parameter\r
160          * \r
161          * @param input\r
162          * @return true if the file is recognised as Clustal formatted alignment,\r
163          *         false otherwise\r
164          */\r
165         public static boolean isValidClustalFile(InputStream input) {\r
166                 if (input == null) {\r
167                         throw new NullPointerException("Input is expected!");\r
168                 }\r
169                 BufferedReader breader = new BufferedReader(\r
170                                 new InputStreamReader(input));\r
171                 try {\r
172                         if (input.available() < 10) {\r
173                                 return false;\r
174                         }\r
175                         // read first 10 lines to find "Clustal"\r
176                         for (int i = 0; i < 10; i++) {\r
177                                 String line = breader.readLine();\r
178                                 if (line != null) {\r
179                                         line = line.toUpperCase().trim();\r
180                                         if (line.contains("CLUSTAL") || line.contains("PROBCONS")) {\r
181                                                 return true;\r
182                                         }\r
183                                 }\r
184                         }\r
185 \r
186                         breader.close();\r
187                 } catch (IOException e) {\r
188                         log.severe("Could not read from the stream! "\r
189                                         + e.getLocalizedMessage() + e.getCause());\r
190                 } finally {\r
191                         SequenceUtil.closeSilently(log, breader);\r
192                 }\r
193                 return false;\r
194         }\r
195 \r
196         /**\r
197          * Write Clustal formatted alignment Limitations: does not record the\r
198          * consensus. Potential bug - records 60 chars length alignment where\r
199          * Clustal would have recorded 50 chars.\r
200          * \r
201          * @param outStream\r
202          * \r
203          * @param alignment\r
204          * @throws IOException\r
205          */\r
206         public static void writeClustalAlignment(final OutputStream outStream,\r
207                         final Alignment alignment) throws IOException {\r
208                 List<FastaSequence> seqs = alignment.getSequences();\r
209 \r
210                 PrintWriter out = new PrintWriter(new OutputStreamWriter(outStream));\r
211 \r
212                 out.write("CLUSTAL\n\n\n");\r
213 \r
214                 int max = 0;\r
215                 int maxidLength = 0;\r
216 \r
217                 int i = 0;\r
218                 // Find the longest sequence name\r
219                 for (FastaSequence fs : seqs) {\r
220                         String tmp = fs.getId();\r
221 \r
222                         if (fs.getSequence().length() > max) {\r
223                                 max = fs.getSequence().length();\r
224                         }\r
225                         if (tmp.length() > maxidLength) {\r
226                                 maxidLength = tmp.length();\r
227                         }\r
228                         i++;\r
229                 }\r
230                 if (maxidLength < minNameHolderLength) {\r
231                         maxidLength = minNameHolderLength;\r
232                 }\r
233                 if (maxidLength > maxNameLength) {\r
234                         maxidLength = 30; // the rest will be trimmed\r
235                 }\r
236 \r
237                 int oneLineAlignmentLength = 60;\r
238                 int nochunks = max / oneLineAlignmentLength + 1;\r
239 \r
240                 for (i = 0; i < nochunks; i++) {\r
241                         int j = 0;\r
242                         for (FastaSequence fs : seqs) {\r
243 \r
244                                 String name = fs.getId();\r
245                                 // display at most 30 characters in the name, keep the names\r
246                                 // 6 spaces away from the alignment for longest sequence names,\r
247                                 // and more than this for shorter names\r
248                                 out.format("%-" + maxidLength + "s" + spacer,\r
249                                                 (name.length() > maxNameLength ? name.substring(0,\r
250                                                                 maxidLength) : name));\r
251                                 int start = i * oneLineAlignmentLength;\r
252                                 int end = start + oneLineAlignmentLength;\r
253 \r
254                                 if (end < fs.getSequence().length()\r
255                                                 && start < fs.getSequence().length()) {\r
256                                         out.write(fs.getSequence().substring(start, end) + "\n");\r
257                                 } else {\r
258                                         if (start < fs.getSequence().length()) {\r
259                                                 out.write(fs.getSequence().substring(start) + "\n");\r
260                                         }\r
261                                 }\r
262                                 j++;\r
263                         }\r
264                         out.write("\n");\r
265                 }\r
266                 try {\r
267                         out.close();\r
268                 } finally {\r
269                         SequenceUtil.closeSilently(log, out);\r
270                 }\r
271         }\r
272 \r
273         public static Alignment readClustalFile(File file)\r
274                         throws UnknownFileFormatException, IOException {\r
275                 if (file == null) {\r
276                         throw new NullPointerException("File is expected!");\r
277                 }\r
278                 FileInputStream fio = new FileInputStream(file);\r
279                 Alignment seqAl = ClustalAlignmentUtil.readClustalFile(fio);\r
280                 try {\r
281                         fio.close();\r
282                 } finally {\r
283                         SequenceUtil.closeSilently(log, fio);\r
284                 }\r
285                 return seqAl;\r
286         }\r
287 }\r