datamodel/compbio/data/sequence/SequenceUtil.java

   1 /* \r
   2  * @(#)SequenceUtil.java 1.0 September 2009\r
   3  * \r
   4  * Copyright (c) 2009 Peter Troshin\r
   5  *  \r
   6  * Jalview Web Services version: 2.0     \r
   7  * \r
   8  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
   9  *  Apache License version 2 as published by the Apache Software Foundation\r
  10  * \r
  11  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
  12  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
  13  *  License for more details.\r
  14  * \r
  15  *  A copy of the license is in apache_license.txt. It is also available here:\r
  16  * see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
  17  * \r
  18  * Any republication or derived work distributed in source code form\r
  19  * must include this copyright and license notice.\r
  20  */\r
  21 \r
  22 package compbio.data.sequence;\r
  23 \r
  24 import java.io.BufferedReader;\r
  25 import java.io.BufferedWriter;\r
  26 import java.io.Closeable;\r
  27 import java.io.File;\r
  28 import java.io.FileInputStream;\r
  29 import java.io.IOException;\r
  30 import java.io.InputStream;\r
  31 import java.io.InputStreamReader;\r
  32 import java.io.OutputStream;\r
  33 import java.io.OutputStreamWriter;\r
  34 import java.util.ArrayList;\r
  35 import java.util.List;\r
  36 import java.util.logging.Level;\r
  37 import java.util.regex.Matcher;\r
  38 import java.util.regex.Pattern;\r
  39 \r
  40 /**\r
  41  * Utility class for operations on sequences\r
  42  * \r
  43  * @author Petr Troshin\r
  44  * @version 1.0\r
  45  */\r
  46 public final class SequenceUtil {\r
  47 \r
  48     /**\r
  49      * A whitespace character: [\t\n\x0B\f\r]\r
  50      */\r
  51     public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
  52 \r
  53     /**\r
  54      * A digit\r
  55      */\r
  56     public static final Pattern DIGIT = Pattern.compile("\\d");\r
  57 \r
  58     /**\r
  59      * Non word\r
  60      */\r
  61     public static final Pattern NONWORD = Pattern.compile("\\W");\r
  62 \r
  63     /**\r
  64      * Valid Amino acids\r
  65      */\r
  66     public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
  67             Pattern.CASE_INSENSITIVE);\r
  68 \r
  69     /**\r
  70      * inversion of AA pattern\r
  71      */\r
  72     public static final Pattern NON_AA = Pattern.compile(\r
  73             "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
  74 \r
  75     /**\r
  76      * Same as AA pattern but with two additional letters - XU\r
  77      */\r
  78     public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
  79             "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
  80 \r
  81     /**\r
  82      * Nucleotides a, t, g, c, u\r
  83      */\r
  84     public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
  85             Pattern.CASE_INSENSITIVE);\r
  86 \r
  87     /**\r
  88      * Ambiguous nucleotide\r
  89      */\r
  90     public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
  91             "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
  92     /**\r
  93      * Non nucleotide\r
  94      */\r
  95     public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
  96             Pattern.CASE_INSENSITIVE);\r
  97 \r
  98     private SequenceUtil() {\r
  99     } // utility class, no instantiation\r
 100 \r
 101     /*\r
 102      * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
 103      * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
 104      * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
 105      * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
 106      * SysPrefs.newlinechar); pir_out.close(); }\r
 107      * \r
 108      * public static void write_FastaSeq(OutputStream os, FastaSequence seq)\r
 109      * throws IOException { BufferedWriter fasta_out = new BufferedWriter( new\r
 110      * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
 111      * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
 112      * SysPrefs.newlinechar); fasta_out.close(); }\r
 113      */\r
 114 \r
 115     /**\r
 116      * @return true is the sequence contains only letters a,c, t, g, u\r
 117      */\r
 118     public static boolean isNucleotideSequence(final FastaSequence s) {\r
 119         return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
 120     }\r
 121 \r
 122     /**\r
 123      * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
 124      * (!) - B char\r
 125      */\r
 126     public static boolean isNonAmbNucleotideSequence(String sequence) {\r
 127         sequence = SequenceUtil.cleanSequence(sequence);\r
 128         if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 129             return false;\r
 130         }\r
 131         if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
 132             return false;\r
 133             /*\r
 134              * System.out.format("I found the text starting at " +\r
 135              * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
 136              * nonDNAmatcher.end());\r
 137              */\r
 138         }\r
 139         final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
 140         return DNAmatcher.find();\r
 141     }\r
 142 \r
 143     /**\r
 144      * Removes all whitespace chars in the sequence string\r
 145      * \r
 146      * @param sequence\r
 147      * @return cleaned up sequence\r
 148      */\r
 149     public static String cleanSequence(String sequence) {\r
 150         assert sequence != null;\r
 151         final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
 152         sequence = m.replaceAll("").toUpperCase();\r
 153         return sequence;\r
 154     }\r
 155 \r
 156     /**\r
 157      * Removes all special characters and digits as well as whitespace chars\r
 158      * from the sequence\r
 159      * \r
 160      * @param sequence\r
 161      * @return cleaned up sequence\r
 162      */\r
 163     public static String deepCleanSequence(String sequence) {\r
 164         sequence = SequenceUtil.cleanSequence(sequence);\r
 165         sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
 166         sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
 167         final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
 168         sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
 169         return sequence;\r
 170     }\r
 171 \r
 172     /**\r
 173      * \r
 174      * @param sequence\r
 175      * @return true is the sequence is a protein sequence, false overwise\r
 176      */\r
 177     public static boolean isProteinSequence(String sequence) {\r
 178         sequence = SequenceUtil.cleanSequence(sequence);\r
 179         if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 180             return false;\r
 181         }\r
 182         if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 183             return false;\r
 184         }\r
 185         if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 186             return false;\r
 187         }\r
 188         final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
 189         return protmatcher.find();\r
 190     }\r
 191 \r
 192     /**\r
 193      * Check whether the sequence confirms to amboguous protein sequence\r
 194      * \r
 195      * @param sequence\r
 196      * @return return true only if the sequence if ambiguous protein sequence\r
 197      *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
 198      *         protein or DNA\r
 199      */\r
 200     public static boolean isAmbiguosProtein(String sequence) {\r
 201         sequence = SequenceUtil.cleanSequence(sequence);\r
 202         if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 203             return false;\r
 204         }\r
 205         if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 206             return false;\r
 207         }\r
 208         if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 209             return false;\r
 210         }\r
 211         if (SequenceUtil.AA.matcher(sequence).find()) {\r
 212             return false;\r
 213         }\r
 214         final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
 215         return amb_prot.find();\r
 216     }\r
 217 \r
 218     /**\r
 219      * Writes list of FastaSequeces into the outstream formatting the sequence\r
 220      * so that it contains width chars on each line\r
 221      * \r
 222      * @param outstream\r
 223      * @param sequences\r
 224      * @param width\r
 225      *            - the maximum number of characters to write in one line\r
 226      * @throws IOException\r
 227      */\r
 228     public static void writeFasta(final OutputStream outstream,\r
 229             final List<FastaSequence> sequences, final int width)\r
 230             throws IOException {\r
 231         final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
 232         final BufferedWriter fastawriter = new BufferedWriter(writer);\r
 233         for (final FastaSequence fs : sequences) {\r
 234             fastawriter.write(fs.getFormatedSequence(width));\r
 235         }\r
 236         outstream.flush();\r
 237         fastawriter.close();\r
 238         writer.close();\r
 239     }\r
 240 \r
 241     /**\r
 242      * Reads fasta sequences from inStream into the list of FastaSequence\r
 243      * objects\r
 244      * \r
 245      * @param inStream\r
 246      *            from\r
 247      * @return list of FastaSequence objects\r
 248      * @throws IOException\r
 249      */\r
 250     public static List<FastaSequence> readFasta(final InputStream inStream)\r
 251             throws IOException {\r
 252         final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
 253 \r
 254         final BufferedReader infasta = new BufferedReader(\r
 255                 new InputStreamReader(inStream, "UTF8"), 16000);\r
 256         final Pattern pattern = Pattern.compile("//s+");\r
 257 \r
 258         String line;\r
 259         String sname = "", seqstr = null;\r
 260         do {\r
 261             line = infasta.readLine();\r
 262             if ((line == null) || line.startsWith(">")) {\r
 263                 if (seqstr != null) {\r
 264                     seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
 265                 }\r
 266                 sname = line; // remove >\r
 267                 seqstr = "";\r
 268             } else {\r
 269                 final String subseq = pattern.matcher(line).replaceAll("");\r
 270                 seqstr += subseq;\r
 271             }\r
 272         } while (line != null);\r
 273 \r
 274         infasta.close();\r
 275         return seqs;\r
 276     }\r
 277 \r
 278     /**\r
 279      * Writes FastaSequence in the file, each sequence will take one line only\r
 280      * \r
 281      * @param os\r
 282      * @param sequences\r
 283      * @throws IOException\r
 284      */\r
 285     public static void writeFasta(final OutputStream os,\r
 286             final List<FastaSequence> sequences) throws IOException {\r
 287         final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
 288         final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
 289         for (final FastaSequence fs : sequences) {\r
 290             fasta_out.write(fs.getOnelineFasta());\r
 291         }\r
 292         fasta_out.close();\r
 293         outWriter.close();\r
 294     }\r
 295 \r
 296     public static List<AnnotatedSequence> readJRonn(final File result)\r
 297             throws IOException, UnknownFileFormatException {\r
 298         InputStream input = new FileInputStream(result);\r
 299         List<AnnotatedSequence> sequences = readJRonn(input);\r
 300         input.close();\r
 301         return sequences;\r
 302     }\r
 303 \r
 304     /**\r
 305      * Reader for JRonn horizontal file format\r
 306      * \r
 307      * >Foobar\r
 308      * \r
 309      * M G D T T A G\r
 310      * \r
 311      * 0.48 0.42 0.42 0.48 0.52 0.53 0.54\r
 312      * \r
 313      * All values are tab delimited\r
 314      * \r
 315      * @param inStream\r
 316      * @return\r
 317      * @throws IOException\r
 318      * @throws UnknownFileFormatException\r
 319      */\r
 320     public static List<AnnotatedSequence> readJRonn(final InputStream inStream)\r
 321             throws IOException, UnknownFileFormatException {\r
 322         final List<AnnotatedSequence> seqs = new ArrayList<AnnotatedSequence>();\r
 323 \r
 324         final BufferedReader infasta = new BufferedReader(\r
 325                 new InputStreamReader(inStream, "UTF8"), 16000);\r
 326 \r
 327         String line;\r
 328         String sname = "";\r
 329         do {\r
 330             line = infasta.readLine();\r
 331             if (line == null || line.isEmpty()) {\r
 332                 // skip empty lines\r
 333                 continue;\r
 334             }\r
 335             if (line.startsWith(">")) {\r
 336                 // read name\r
 337                 sname = line.trim().substring(1);\r
 338                 // read sequence line\r
 339                 line = infasta.readLine();\r
 340                 final String sequence = line.replace("\t", "");\r
 341                 // read annotation line\r
 342                 line = infasta.readLine();\r
 343                 String[] annotValues = line.split("\t");\r
 344                 float[] annotation = convertToNumber(annotValues);\r
 345                 if (annotation.length != sequence.length()) {\r
 346                     throw new UnknownFileFormatException(\r
 347                             "File does not look like Jronn horizontally formatted output file!\n"\r
 348                                     + JRONN_WRONG_FORMAT_MESSAGE);\r
 349                 }\r
 350                 seqs.add(new AnnotatedSequence(sname, sequence, annotation));\r
 351             }\r
 352         } while (line != null);\r
 353 \r
 354         infasta.close();\r
 355         return seqs;\r
 356     }\r
 357 \r
 358     private static float[] convertToNumber(String[] annotValues)\r
 359             throws UnknownFileFormatException {\r
 360         float[] annotation = new float[annotValues.length];\r
 361         try {\r
 362             for (int i = 0; i < annotation.length; i++) {\r
 363                 annotation[i] = Float.parseFloat(annotValues[i]);\r
 364             }\r
 365         } catch (NumberFormatException e) {\r
 366             throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE, e\r
 367                     .getCause());\r
 368         }\r
 369         return annotation;\r
 370     }\r
 371 \r
 372     private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
 373             + ">sequence_name\n "\r
 374             + "M        V       S\n"\r
 375             + "0.43     0.22    0.65\n"\r
 376             + "Where first line is the sequence name,\n"\r
 377             + "second line is the tab delimited sequence,\n"\r
 378             + "third line contains tab delimited disorder prediction values.\n"\r
 379             + "No lines are allowed between these three. Additionally, the number of  "\r
 380             + "sequence residues must be equal to the number of the disorder values.";\r
 381 \r
 382     /**\r
 383      * Closes the Closable and logs the exception if any\r
 384      * \r
 385      * @param log\r
 386      * @param stream\r
 387      */\r
 388     public final static void closeSilently(java.util.logging.Logger log,\r
 389             Closeable stream) {\r
 390         if (stream != null) {\r
 391             try {\r
 392                 stream.close();\r
 393             } catch (IOException e) {\r
 394                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
 395             }\r
 396         }\r
 397     }\r
 398 \r
 399 }\r