X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=datamodel%2Fcompbio%2Fdata%2Fsequence%2FClustalAlignmentUtil.java;h=fc44c441b464106fb7a11dd19c1a7078cce89243;hb=b7a076f21e11b8cfa99f9e52759dcff702b957fd;hp=5fce997ec776122a30bb78e23b580d55a2db6d60;hpb=535359a3d592ee41bda72e7356f0181f6cee9d07;p=jabaws.git diff --git a/datamodel/compbio/data/sequence/ClustalAlignmentUtil.java b/datamodel/compbio/data/sequence/ClustalAlignmentUtil.java index 5fce997..fc44c44 100644 --- a/datamodel/compbio/data/sequence/ClustalAlignmentUtil.java +++ b/datamodel/compbio/data/sequence/ClustalAlignmentUtil.java @@ -24,9 +24,7 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.io.PrintWriter; +import java.io.Writer; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -40,251 +38,251 @@ import java.util.logging.Logger; * * @author Petr Troshin based on jimp class * - * Date September 2009 + * @version 1.0 September 2009 * */ public final class ClustalAlignmentUtil { - private static final Logger log = Logger - .getLogger(ClustalAlignmentUtil.class.getCanonicalName()); - - /** - * Dash char to be used as gap char in the alignments - */ - public static final char gapchar = '-'; - - /* - * Number of spaces separating the name and the sequence - */ - private static final String spacer = " "; // 6 space characters - /* - * name length limit is 30 characters! 2.0.7 - 2.0.12 clustalw /* if name is - * longer than that it gets trimmed in the end - */ - private static final int maxNameLength = 30; // Maximum name length - /* - * If all sequences names in the alignment is shorter than - * minNameHolderLength than spaces are added to complete the name up to - * minNameHolderLength - */ - private static final int minNameHolderLength = 10; // Minimum number of - - // TODO check whether clustal still loads data if length is 60! - private static final int oneLineAlignmentLength = 60; // this could in fact - - // be 50 - - // for long names ~30 chars - - /** - * Read Clustal formatted alignment. Limitations: Does not read consensus - * - * Sequence names as well as the sequences are not guaranteed to be unique! - * - * @throws {@link IOException} - * @throws {@link UnknownFileFormatException} - */ - public static Alignment readClustalFile(InputStream instream) - throws IOException, UnknownFileFormatException { - - boolean flag = false; - - List headers = new ArrayList(); - Map seqhash = new HashMap(); - FastaSequence[] seqs = null; - - String line; - - BufferedReader breader = new BufferedReader(new InputStreamReader( - instream)); - while ((line = breader.readLine()) != null) { - if (line.indexOf(" ") != 0) { - java.util.StringTokenizer str = new StringTokenizer(line, " "); - String id = ""; - - if (str.hasMoreTokens()) { - id = str.nextToken(); - // PROBCONS output clustal formatted file with not mention - // of CLUSTAL (:-)) - if (id.equals("CLUSTAL") || id.equals("PROBCONS")) { - flag = true; - } else { - if (flag) { - StringBuffer tempseq; - if (seqhash.containsKey(id)) { - tempseq = seqhash.get(id); - } else { - tempseq = new StringBuffer(); - seqhash.put(id, tempseq); - } - - if (!(headers.contains(id))) { - headers.add(id); - } - - tempseq.append(str.nextToken()); + private static final Logger log = Logger + .getLogger(ClustalAlignmentUtil.class.getCanonicalName()); + + /** + * Dash char to be used as gap char in the alignments + */ + public static final char gapchar = '-'; + + /* + * Number of spaces separating the name and the sequence + */ + private static final String spacer = " "; // 6 space characters + /* + * name length limit is 30 characters! 2.0.7 - 2.0.12 clustalw /* if name is + * longer than that it gets trimmed in the end + */ + private static final int maxNameLength = 30; // Maximum name length + /* + * If all sequences names in the alignment is shorter than + * minNameHolderLength than spaces are added to complete the name up to + * minNameHolderLength + */ + private static final int minNameHolderLength = 10; // Minimum number of + + // TODO check whether clustal still loads data if length is 60! + private static final int oneLineAlignmentLength = 60; // this could in fact + + // be 50 + + // for long names ~30 chars + + /** + * Read Clustal formatted alignment. Limitations: Does not read consensus + * + * Sequence names as well as the sequences are not guaranteed to be unique! + * + * @throws {@link IOException} + * @throws {@link UnknownFileFormatException} + */ + public static Alignment readClustalFile(InputStream instream) + throws IOException, UnknownFileFormatException { + + boolean flag = false; + + List headers = new ArrayList(); + Map seqhash = new HashMap(); + FastaSequence[] seqs = null; + + String line; + + BufferedReader breader = new BufferedReader(new InputStreamReader( + instream)); + while ((line = breader.readLine()) != null) { + if (line.indexOf(" ") != 0) { + java.util.StringTokenizer str = new StringTokenizer(line, " "); + String id = ""; + + if (str.hasMoreTokens()) { + id = str.nextToken(); + // PROBCONS output clustal formatted file with not mention + // of CLUSTAL (:-)) + if (id.equals("CLUSTAL") || id.equals("PROBCONS")) { + flag = true; + } else { + if (flag) { + StringBuffer tempseq; + if (seqhash.containsKey(id)) { + tempseq = seqhash.get(id); + } else { + tempseq = new StringBuffer(); + seqhash.put(id, tempseq); + } + + if (!(headers.contains(id))) { + headers.add(id); + } + + tempseq.append(str.nextToken()); + } + } + } } - } } - } - } - breader.close(); + breader.close(); - // TODO improve this bit - if (flag) { + // TODO improve this bit + if (flag) { - // Add sequences to the hash - seqs = new FastaSequence[headers.size()]; - for (int i = 0; i < headers.size(); i++) { - if (seqhash.get(headers.get(i)) != null) { + // Add sequences to the hash + seqs = new FastaSequence[headers.size()]; + for (int i = 0; i < headers.size(); i++) { + if (seqhash.get(headers.get(i)) != null) { - FastaSequence newSeq = new FastaSequence(headers.get(i), - seqhash.get(headers.get(i)).toString()); + FastaSequence newSeq = new FastaSequence(headers.get(i), + seqhash.get(headers.get(i)).toString()); - seqs[i] = newSeq; + seqs[i] = newSeq; - } else { - // should not happened - throw new AssertionError( - "Bizarreness! Can't find sequence for " - + headers.get(i)); + } else { + // should not happened + throw new AssertionError( + "Bizarreness! Can't find sequence for " + + headers.get(i)); + } + } } - } - } - if (seqs == null || seqs.length == 0) { - throw new UnknownFileFormatException( - "Input does not appear to be a clustal file! "); - } - return new Alignment(Arrays.asList(seqs), new AlignmentMetadata( - Program.CLUSTAL, gapchar)); - } - - /** - * - * @param input - * @return true if the file is recognised as Clustal formatted alignment, - * false otherwise - */ - public static boolean isValidClustalFile(InputStream input) { - if (input == null) { - throw new NullPointerException("Input is expected!"); - } - BufferedReader breader = new BufferedReader( - new InputStreamReader(input)); - try { - if (input.available() < 10) { - return false; - } - // read first 10 lines to find "Clustal" - for (int i = 0; i < 10; i++) { - String line = breader.readLine(); - if (line != null) { - line = line.toUpperCase().trim(); - if (line.contains("CLUSTAL") || line.contains("PROBCONS")) { - return true; - } + if (seqs == null || seqs.length == 0) { + throw new UnknownFileFormatException( + "Input does not appear to be a clustal file! "); } - } - - breader.close(); - } catch (IOException e) { - log.severe("Could not read from the stream! " - + e.getLocalizedMessage() + e.getCause()); - } finally { - SequenceUtil.closeSilently(log, breader); - } - return false; - } - - /** - * Write Clustal formatted alignment Limitations: does not record the - * consensus. Potential bug - records 60 chars length alignment where - * Clustal would have recorded 50 chars. - * - * @param outStream - * - * @param alignment - * @throws IOException - */ - public static void writeClustalAlignment(final OutputStream outStream, - final Alignment alignment) throws IOException { - List seqs = alignment.getSequences(); - - PrintWriter out = new PrintWriter(new OutputStreamWriter(outStream)); - - out.write("CLUSTAL\n\n\n"); - - int max = 0; - int maxidLength = 0; - - int i = 0; - // Find the longest sequence name - for (FastaSequence fs : seqs) { - String tmp = fs.getId(); - - if (fs.getSequence().length() > max) { - max = fs.getSequence().length(); - } - if (tmp.length() > maxidLength) { - maxidLength = tmp.length(); - } - i++; - } - if (maxidLength < minNameHolderLength) { - maxidLength = minNameHolderLength; - } - if (maxidLength > maxNameLength) { - maxidLength = 30; // the rest will be trimmed + return new Alignment(Arrays.asList(seqs), new AlignmentMetadata( + Program.CLUSTAL, gapchar)); } - int oneLineAlignmentLength = 60; - int nochunks = max / oneLineAlignmentLength + 1; - - for (i = 0; i < nochunks; i++) { - int j = 0; - for (FastaSequence fs : seqs) { - - String name = fs.getId(); - // display at most 30 characters in the name, keep the names - // 6 spaces away from the alignment for longest sequence names, - // and more than this for shorter names - out.format("%-" + maxidLength + "s" + spacer, - (name.length() > maxNameLength ? name.substring(0, - maxidLength) : name)); - int start = i * oneLineAlignmentLength; - int end = start + oneLineAlignmentLength; - - if (end < fs.getSequence().length() - && start < fs.getSequence().length()) { - out.write(fs.getSequence().substring(start, end) + "\n"); - } else { - if (start < fs.getSequence().length()) { - out.write(fs.getSequence().substring(start) + "\n"); - } + /** + * Please note this method closes the input stream provided as a parameter + * + * @param input + * @return true if the file is recognised as Clustal formatted alignment, + * false otherwise + */ + public static boolean isValidClustalFile(InputStream input) { + if (input == null) { + throw new NullPointerException("Input is expected!"); } - j++; - } - out.write("\n"); - } - try { - out.close(); - } finally { - SequenceUtil.closeSilently(log, out); + BufferedReader breader = new BufferedReader( + new InputStreamReader(input)); + try { + if (input.available() < 10) { + return false; + } + // read first 10 lines to find "Clustal" + for (int i = 0; i < 10; i++) { + String line = breader.readLine(); + if (line != null) { + line = line.toUpperCase().trim(); + if (line.contains("CLUSTAL") || line.contains("PROBCONS")) { + return true; + } + } + } + + breader.close(); + } catch (IOException e) { + log.severe("Could not read from the stream! " + + e.getLocalizedMessage() + e.getCause()); + } finally { + SequenceUtil.closeSilently(log, breader); + } + return false; } - } - public static Alignment readClustalFile(File file) - throws UnknownFileFormatException, IOException { - if (file == null) { - throw new NullPointerException("File is expected!"); + /** + * Write Clustal formatted alignment Limitations: does not record the + * consensus. Potential bug - records 60 chars length alignment where + * Clustal would have recorded 50 chars. + * + * @param out + * + * @param alignment + * @throws IOException + */ + public static void writeClustalAlignment(final Writer out, + final Alignment alignment) throws IOException { + List seqs = alignment.getSequences(); + + out.write("CLUSTAL\n\n\n"); + + int max = 0; + int maxidLength = 0; + + int i = 0; + // Find the longest sequence name + for (FastaSequence fs : seqs) { + String tmp = fs.getId(); + + if (fs.getSequence().length() > max) { + max = fs.getSequence().length(); + } + if (tmp.length() > maxidLength) { + maxidLength = tmp.length(); + } + i++; + } + if (maxidLength < minNameHolderLength) { + maxidLength = minNameHolderLength; + } + if (maxidLength > maxNameLength) { + maxidLength = 30; // the rest will be trimmed + } + + int oneLineAlignmentLength = 60; + int nochunks = max / oneLineAlignmentLength + 1; + + for (i = 0; i < nochunks; i++) { + int j = 0; + for (FastaSequence fs : seqs) { + + String name = fs.getId(); + // display at most 30 characters in the name, keep the names + // 6 spaces away from the alignment for longest sequence names, + // and more than this for shorter names + out.write(String.format( + "%-" + maxidLength + "s" + spacer, + (name.length() > maxNameLength ? name.substring(0, + maxidLength) : name))); + int start = i * oneLineAlignmentLength; + int end = start + oneLineAlignmentLength; + + if (end < fs.getSequence().length() + && start < fs.getSequence().length()) { + out.write(fs.getSequence().substring(start, end) + "\n"); + } else { + if (start < fs.getSequence().length()) { + out.write(fs.getSequence().substring(start) + "\n"); + } + } + j++; + } + out.write("\n"); + } + try { + out.close(); + } finally { + SequenceUtil.closeSilently(log, out); + } } - FileInputStream fio = new FileInputStream(file); - Alignment seqAl = ClustalAlignmentUtil.readClustalFile(fio); - try { - fio.close(); - } finally { - SequenceUtil.closeSilently(log, fio); + + public static Alignment readClustalFile(File file) + throws UnknownFileFormatException, IOException { + if (file == null) { + throw new NullPointerException("File is expected!"); + } + FileInputStream fio = new FileInputStream(file); + Alignment seqAl = ClustalAlignmentUtil.readClustalFile(fio); + try { + fio.close(); + } finally { + SequenceUtil.closeSilently(log, fio); + } + return seqAl; } - return seqAl; - } }