datamodel/compbio/data/sequence/SequenceUtil.java

   1 /*\r
   2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
   3  * Jalview Web Services version: 2.0 This library is free software; you can\r
   4  * redistribute it and/or modify it under the terms of the Apache License\r
   5  * version 2 as published by the Apache Software Foundation This library is\r
   6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
   7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
   8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
   9  * license is in apache_license.txt. It is also available here: see:\r
  10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
  11  * work distributed in source code form must include this copyright and license\r
  12  * notice.\r
  13  */\r
  14 \r
  15 package compbio.data.sequence;\r
  16 \r
  17 import java.io.BufferedReader;\r
  18 import java.io.BufferedWriter;\r
  19 import java.io.Closeable;\r
  20 import java.io.File;\r
  21 import java.io.FileInputStream;\r
  22 import java.io.IOException;\r
  23 import java.io.InputStream;\r
  24 import java.io.InputStreamReader;\r
  25 import java.io.OutputStream;\r
  26 import java.io.OutputStreamWriter;\r
  27 import java.util.ArrayList;\r
  28 import java.util.List;\r
  29 import java.util.Scanner;\r
  30 import java.util.logging.Level;\r
  31 import java.util.regex.Matcher;\r
  32 import java.util.regex.Pattern;\r
  33 \r
  34 /**\r
  35  * Utility class for operations on sequences\r
  36  * \r
  37  * @author Petr Troshin\r
  38  * @version 1.0\r
  39  */\r
  40 public final class SequenceUtil {\r
  41 \r
  42         /**\r
  43          * A whitespace character: [\t\n\x0B\f\r]\r
  44          */\r
  45         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
  46 \r
  47         /**\r
  48          * A digit\r
  49          */\r
  50         public static final Pattern DIGIT = Pattern.compile("\\d");\r
  51 \r
  52         /**\r
  53          * Non word\r
  54          */\r
  55         public static final Pattern NONWORD = Pattern.compile("\\W");\r
  56 \r
  57         /**\r
  58          * Valid Amino acids\r
  59          */\r
  60         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
  61                         Pattern.CASE_INSENSITIVE);\r
  62 \r
  63         /**\r
  64          * inversion of AA pattern\r
  65          */\r
  66         public static final Pattern NON_AA = Pattern.compile(\r
  67                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
  68 \r
  69         /**\r
  70          * Same as AA pattern but with two additional letters - XU\r
  71          */\r
  72         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
  73                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
  74 \r
  75         /**\r
  76          * Nucleotides a, t, g, c, u\r
  77          */\r
  78         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
  79                         Pattern.CASE_INSENSITIVE);\r
  80 \r
  81         /**\r
  82          * Ambiguous nucleotide\r
  83          */\r
  84         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
  85                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
  86         /**\r
  87          * Non nucleotide\r
  88          */\r
  89         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
  90                         Pattern.CASE_INSENSITIVE);\r
  91 \r
  92         private SequenceUtil() {\r
  93         } // utility class, no instantiation\r
  94 \r
  95         /*\r
  96          * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
  97          * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
  98          * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
  99          * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
 100          * SysPrefs.newlinechar); pir_out.close(); } public static void\r
 101          * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
 102          * BufferedWriter fasta_out = new BufferedWriter( new\r
 103          * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
 104          * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
 105          * SysPrefs.newlinechar); fasta_out.close(); }\r
 106          */\r
 107 \r
 108         /**\r
 109          * @return true is the sequence contains only letters a,c, t, g, u\r
 110          */\r
 111         public static boolean isNucleotideSequence(final FastaSequence s) {\r
 112                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
 113         }\r
 114 \r
 115         /**\r
 116          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
 117          * (!) - B char\r
 118          */\r
 119         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
 120                 sequence = SequenceUtil.cleanSequence(sequence);\r
 121                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 122                         return false;\r
 123                 }\r
 124                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
 125                         return false;\r
 126                         /*\r
 127                          * System.out.format("I found the text starting at " +\r
 128                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
 129                          * nonDNAmatcher.end());\r
 130                          */\r
 131                 }\r
 132                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
 133                 return DNAmatcher.find();\r
 134         }\r
 135 \r
 136         /**\r
 137          * Removes all whitespace chars in the sequence string\r
 138          * \r
 139          * @param sequence\r
 140          * @return cleaned up sequence\r
 141          */\r
 142         public static String cleanSequence(String sequence) {\r
 143                 assert sequence != null;\r
 144                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
 145                 sequence = m.replaceAll("").toUpperCase();\r
 146                 return sequence;\r
 147         }\r
 148 \r
 149         /**\r
 150          * Removes all special characters and digits as well as whitespace chars\r
 151          * from the sequence\r
 152          * \r
 153          * @param sequence\r
 154          * @return cleaned up sequence\r
 155          */\r
 156         public static String deepCleanSequence(String sequence) {\r
 157                 sequence = SequenceUtil.cleanSequence(sequence);\r
 158                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
 159                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
 160                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
 161                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
 162                 return sequence;\r
 163         }\r
 164 \r
 165         /**\r
 166          * @param sequence\r
 167          * @return true is the sequence is a protein sequence, false overwise\r
 168          */\r
 169         public static boolean isProteinSequence(String sequence) {\r
 170                 sequence = SequenceUtil.cleanSequence(sequence);\r
 171                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 172                         return false;\r
 173                 }\r
 174                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 175                         return false;\r
 176                 }\r
 177                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 178                         return false;\r
 179                 }\r
 180                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
 181                 return protmatcher.find();\r
 182         }\r
 183 \r
 184         /**\r
 185          * Check whether the sequence confirms to amboguous protein sequence\r
 186          * \r
 187          * @param sequence\r
 188          * @return return true only if the sequence if ambiguous protein sequence\r
 189          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
 190          *         protein or DNA\r
 191          */\r
 192         public static boolean isAmbiguosProtein(String sequence) {\r
 193                 sequence = SequenceUtil.cleanSequence(sequence);\r
 194                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 195                         return false;\r
 196                 }\r
 197                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 198                         return false;\r
 199                 }\r
 200                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 201                         return false;\r
 202                 }\r
 203                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
 204                         return false;\r
 205                 }\r
 206                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
 207                 return amb_prot.find();\r
 208         }\r
 209 \r
 210         /**\r
 211          * Writes list of FastaSequeces into the outstream formatting the sequence\r
 212          * so that it contains width chars on each line\r
 213          * \r
 214          * @param outstream\r
 215          * @param sequences\r
 216          * @param width\r
 217          *            - the maximum number of characters to write in one line\r
 218          * @throws IOException\r
 219          */\r
 220         public static void writeFasta(final OutputStream outstream,\r
 221                         final List<FastaSequence> sequences, final int width)\r
 222                         throws IOException {\r
 223                 writeFastaKeepTheStream(outstream, sequences, width);\r
 224                 outstream.close();\r
 225         }\r
 226 \r
 227         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
 228                         final List<FastaSequence> sequences, final int width)\r
 229                         throws IOException {\r
 230                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
 231                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
 232                 for (final FastaSequence fs : sequences) {\r
 233                         fastawriter.write(">" + fs.getId() + "\n");\r
 234                         fastawriter.write(fs.getFormatedSequence(width));\r
 235                         fastawriter.write("\n");\r
 236                 }\r
 237                 fastawriter.flush();\r
 238                 writer.flush();\r
 239         }\r
 240 \r
 241         /**\r
 242          * Reads fasta sequences from inStream into the list of FastaSequence\r
 243          * objects\r
 244          * \r
 245          * @param inStream\r
 246          *            from\r
 247          * @return list of FastaSequence objects\r
 248          * @throws IOException\r
 249          */\r
 250         public static List<FastaSequence> readFasta(final InputStream inStream)\r
 251                         throws IOException {\r
 252                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
 253 \r
 254                 final BufferedReader infasta = new BufferedReader(\r
 255                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
 256                 final Pattern pattern = Pattern.compile("//s+");\r
 257 \r
 258                 String line;\r
 259                 String sname = "", seqstr = null;\r
 260                 do {\r
 261                         line = infasta.readLine();\r
 262                         if ((line == null) || line.startsWith(">")) {\r
 263                                 if (seqstr != null) {\r
 264                                         seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
 265                                 }\r
 266                                 sname = line; // remove >\r
 267                                 seqstr = "";\r
 268                         } else {\r
 269                                 final String subseq = pattern.matcher(line).replaceAll("");\r
 270                                 seqstr += subseq;\r
 271                         }\r
 272                 } while (line != null);\r
 273 \r
 274                 infasta.close();\r
 275                 return seqs;\r
 276         }\r
 277 \r
 278         /**\r
 279          * Writes FastaSequence in the file, each sequence will take one line only\r
 280          * \r
 281          * @param os\r
 282          * @param sequences\r
 283          * @throws IOException\r
 284          */\r
 285         public static void writeFasta(final OutputStream os,\r
 286                         final List<FastaSequence> sequences) throws IOException {\r
 287                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
 288                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
 289                 for (final FastaSequence fs : sequences) {\r
 290                         fasta_out.write(fs.getOnelineFasta());\r
 291                 }\r
 292                 fasta_out.close();\r
 293                 outWriter.close();\r
 294         }\r
 295 \r
 296         public static List<AnnotatedSequence> readJRonn(final File result)\r
 297                         throws IOException, UnknownFileFormatException {\r
 298                 InputStream input = new FileInputStream(result);\r
 299                 List<AnnotatedSequence> sequences = readJRonn(input);\r
 300                 input.close();\r
 301                 return sequences;\r
 302         }\r
 303 \r
 304         /**\r
 305          * Reader for JRonn horizontal file format >Foobar M G D T T A G 0.48 0.42\r
 306          * 0.42 0.48 0.52 0.53 0.54 All values are tab delimited\r
 307          * \r
 308          * @param inStream\r
 309          * @return\r
 310          * @throws IOException\r
 311          * @throws UnknownFileFormatException\r
 312          */\r
 313         public static List<AnnotatedSequence> readJRonn(final InputStream inStream)\r
 314                         throws IOException, UnknownFileFormatException {\r
 315                 final List<AnnotatedSequence> seqs = new ArrayList<AnnotatedSequence>();\r
 316 \r
 317                 final BufferedReader infasta = new BufferedReader(\r
 318                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
 319 \r
 320                 String line;\r
 321                 String sname = "";\r
 322                 do {\r
 323                         line = infasta.readLine();\r
 324                         if (line == null || line.isEmpty()) {\r
 325                                 // skip empty lines\r
 326                                 continue;\r
 327                         }\r
 328                         if (line.startsWith(">")) {\r
 329                                 // read name\r
 330                                 sname = line.trim().substring(1);\r
 331                                 // read sequence line\r
 332                                 line = infasta.readLine();\r
 333                                 final String sequence = line.replace("\t", "");\r
 334                                 // read annotation line\r
 335                                 line = infasta.readLine();\r
 336                                 String[] annotValues = line.split("\t");\r
 337                                 float[] annotation = convertToNumber(annotValues);\r
 338                                 if (annotation.length != sequence.length()) {\r
 339                                         throw new UnknownFileFormatException(\r
 340                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
 341                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
 342                                 }\r
 343                                 seqs.add(new AnnotatedSequence(sname, sequence, annotation));\r
 344                         }\r
 345                 } while (line != null);\r
 346 \r
 347                 infasta.close();\r
 348                 return seqs;\r
 349         }\r
 350 \r
 351         private static float[] convertToNumber(String[] annotValues)\r
 352                         throws UnknownFileFormatException {\r
 353                 float[] annotation = new float[annotValues.length];\r
 354                 try {\r
 355                         for (int i = 0; i < annotation.length; i++) {\r
 356                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
 357                         }\r
 358                 } catch (NumberFormatException e) {\r
 359                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
 360                                         e.getCause());\r
 361                 }\r
 362                 return annotation;\r
 363         }\r
 364 \r
 365         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
 366                         + ">sequence_name\n "\r
 367                         + "M    V       S\n"\r
 368                         + "0.43 0.22    0.65\n"\r
 369                         + "Where first line is the sequence name,\n"\r
 370                         + "second line is the tab delimited sequence,\n"\r
 371                         + "third line contains tab delimited disorder prediction values.\n"\r
 372                         + "No lines are allowed between these three. Additionally, the number of  "\r
 373                         + "sequence residues must be equal to the number of the disorder values.";\r
 374 \r
 375         /**\r
 376          * Closes the Closable and logs the exception if any\r
 377          * \r
 378          * @param log\r
 379          * @param stream\r
 380          */\r
 381         public final static void closeSilently(java.util.logging.Logger log,\r
 382                         Closeable stream) {\r
 383                 if (stream != null) {\r
 384                         try {\r
 385                                 stream.close();\r
 386                         } catch (IOException e) {\r
 387                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
 388                         }\r
 389                 }\r
 390         }\r
 391 \r
 392         /**\r
 393          * \r
 394          * TODO complete!\r
 395          * \r
 396          * # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512 0.37094 T 0.79983\r
 397          * 0.85864 0.44331 .... # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512\r
 398          * 0.37094\r
 399          * \r
 400          * @param input\r
 401          * @return\r
 402          * @throws IOException\r
 403          * @throws UnknownFileFormatException\r
 404          */\r
 405         public static List<MultiAnnotatedSequence<DisemblResultAnnot>> readDisembl(\r
 406                         final InputStream input) throws IOException,\r
 407                         UnknownFileFormatException {\r
 408                 Scanner scan = new Scanner(input);\r
 409                 scan.useDelimiter("# RESIDUE COILS REM465 HOTLOOPS\n");\r
 410                 if (!scan.hasNext()) {\r
 411                         throw new UnknownFileFormatException(\r
 412                                         "In Disembl score format each seqeunce score is expected to start from the line: "\r
 413                                                         + "'# RESIDUE COILS REM465 HOTLOOPS\\n'."\r
 414                                                         + " No such line was found!");\r
 415                 }\r
 416 \r
 417                 List<MultiAnnotatedSequence<DisemblResultAnnot>> results = new ArrayList<MultiAnnotatedSequence<DisemblResultAnnot>>();\r
 418                 int seqCounter = 0;\r
 419                 while (scan.hasNext()) {\r
 420                         seqCounter++;\r
 421                         String singleSeq = scan.next();\r
 422                         Scanner scansingle = new Scanner(singleSeq);\r
 423                         StringBuffer seqbuffer = new StringBuffer();\r
 424                         List<Float> coils = new ArrayList<Float>();\r
 425                         List<Float> rem = new ArrayList<Float>();\r
 426                         List<Float> hotloops = new ArrayList<Float>();\r
 427 \r
 428                         MultiAnnotatedSequence<DisemblResultAnnot> disemblRes = new MultiAnnotatedSequence<DisemblResultAnnot>(\r
 429                                         DisemblResultAnnot.class);\r
 430 \r
 431                         while (scansingle.hasNextLine()) {\r
 432                                 String valueLine = scansingle.nextLine();\r
 433                                 Scanner values = new Scanner(valueLine);\r
 434                                 seqbuffer.append(values.next());\r
 435                                 coils.add(values.nextFloat());\r
 436                                 rem.add(values.nextFloat());\r
 437                                 hotloops.add(values.nextFloat());\r
 438                                 values.close();\r
 439                         }\r
 440                         disemblRes.addAnnotation(DisemblResultAnnot.COILS, coils);\r
 441                         disemblRes.addAnnotation(DisemblResultAnnot.REM465, rem);\r
 442                         disemblRes.addAnnotation(DisemblResultAnnot.HOTLOOPS, hotloops);\r
 443                         // TODO\r
 444                         // disemblRes.sequence = seqbuffer.toString();\r
 445                         scansingle.close();\r
 446                         results.add(disemblRes);\r
 447                 }\r
 448 \r
 449                 input.close();\r
 450                 return results;\r
 451         }\r
 452 \r
 453 }\r