datamodel/compbio/data/sequence/SequenceUtil.java

   1 /*\r
   2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
   3  * Jalview Web Services version: 2.0 This library is free software; you can\r
   4  * redistribute it and/or modify it under the terms of the Apache License\r
   5  * version 2 as published by the Apache Software Foundation This library is\r
   6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
   7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
   8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
   9  * license is in apache_license.txt. It is also available here: see:\r
  10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
  11  * work distributed in source code form must include this copyright and license\r
  12  * notice.\r
  13  */\r
  14 \r
  15 package compbio.data.sequence;\r
  16 \r
  17 import java.io.BufferedReader;\r
  18 import java.io.BufferedWriter;\r
  19 import java.io.Closeable;\r
  20 import java.io.File;\r
  21 import java.io.FileInputStream;\r
  22 import java.io.IOException;\r
  23 import java.io.InputStream;\r
  24 import java.io.InputStreamReader;\r
  25 import java.io.OutputStream;\r
  26 import java.io.OutputStreamWriter;\r
  27 import java.util.ArrayList;\r
  28 import java.util.List;\r
  29 import java.util.Scanner;\r
  30 import java.util.logging.Level;\r
  31 import java.util.regex.Matcher;\r
  32 import java.util.regex.Pattern;\r
  33 \r
  34 import compbio.conservation.Method;\r
  35 \r
  36 /**\r
  37  * Utility class for operations on sequences\r
  38  * \r
  39  * @author Petr Troshin\r
  40  * @version 1.0\r
  41  */\r
  42 public final class SequenceUtil {\r
  43 \r
  44         /**\r
  45          * A whitespace character: [\t\n\x0B\f\r]\r
  46          */\r
  47         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
  48 \r
  49         /**\r
  50          * A digit\r
  51          */\r
  52         public static final Pattern DIGIT = Pattern.compile("\\d");\r
  53 \r
  54         /**\r
  55          * Non word\r
  56          */\r
  57         public static final Pattern NONWORD = Pattern.compile("\\W");\r
  58 \r
  59         /**\r
  60          * Valid Amino acids\r
  61          */\r
  62         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
  63                         Pattern.CASE_INSENSITIVE);\r
  64 \r
  65         /**\r
  66          * inversion of AA pattern\r
  67          */\r
  68         public static final Pattern NON_AA = Pattern.compile(\r
  69                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
  70 \r
  71         /**\r
  72          * Same as AA pattern but with two additional letters - XU\r
  73          */\r
  74         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
  75                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
  76 \r
  77         /**\r
  78          * Nucleotides a, t, g, c, u\r
  79          */\r
  80         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
  81                         Pattern.CASE_INSENSITIVE);\r
  82 \r
  83         /**\r
  84          * Ambiguous nucleotide\r
  85          */\r
  86         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
  87                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
  88         /**\r
  89          * Non nucleotide\r
  90          */\r
  91         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
  92                         Pattern.CASE_INSENSITIVE);\r
  93 \r
  94         private SequenceUtil() {\r
  95         } // utility class, no instantiation\r
  96 \r
  97         /*\r
  98          * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
  99          * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
 100          * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
 101          * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
 102          * SysPrefs.newlinechar); pir_out.close(); } public static void\r
 103          * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
 104          * BufferedWriter fasta_out = new BufferedWriter( new\r
 105          * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
 106          * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
 107          * SysPrefs.newlinechar); fasta_out.close(); }\r
 108          */\r
 109 \r
 110         /**\r
 111          * @return true is the sequence contains only letters a,c, t, g, u\r
 112          */\r
 113         public static boolean isNucleotideSequence(final FastaSequence s) {\r
 114                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
 115         }\r
 116 \r
 117         /**\r
 118          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
 119          * (!) - B char\r
 120          */\r
 121         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
 122                 sequence = SequenceUtil.cleanSequence(sequence);\r
 123                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 124                         return false;\r
 125                 }\r
 126                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
 127                         return false;\r
 128                         /*\r
 129                          * System.out.format("I found the text starting at " +\r
 130                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
 131                          * nonDNAmatcher.end());\r
 132                          */\r
 133                 }\r
 134                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
 135                 return DNAmatcher.find();\r
 136         }\r
 137 \r
 138         /**\r
 139          * Removes all whitespace chars in the sequence string\r
 140          * \r
 141          * @param sequence\r
 142          * @return cleaned up sequence\r
 143          */\r
 144         public static String cleanSequence(String sequence) {\r
 145                 assert sequence != null;\r
 146                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
 147                 sequence = m.replaceAll("").toUpperCase();\r
 148                 return sequence;\r
 149         }\r
 150 \r
 151         /**\r
 152          * Removes all special characters and digits as well as whitespace chars\r
 153          * from the sequence\r
 154          * \r
 155          * @param sequence\r
 156          * @return cleaned up sequence\r
 157          */\r
 158         public static String deepCleanSequence(String sequence) {\r
 159                 sequence = SequenceUtil.cleanSequence(sequence);\r
 160                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
 161                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
 162                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
 163                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
 164                 return sequence;\r
 165         }\r
 166 \r
 167         /**\r
 168          * @param sequence\r
 169          * @return true is the sequence is a protein sequence, false overwise\r
 170          */\r
 171         public static boolean isProteinSequence(String sequence) {\r
 172                 sequence = SequenceUtil.cleanSequence(sequence);\r
 173                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 174                         return false;\r
 175                 }\r
 176                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 177                         return false;\r
 178                 }\r
 179                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 180                         return false;\r
 181                 }\r
 182                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
 183                 return protmatcher.find();\r
 184         }\r
 185 \r
 186         /**\r
 187          * Check whether the sequence confirms to amboguous protein sequence\r
 188          * \r
 189          * @param sequence\r
 190          * @return return true only if the sequence if ambiguous protein sequence\r
 191          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
 192          *         protein or DNA\r
 193          */\r
 194         public static boolean isAmbiguosProtein(String sequence) {\r
 195                 sequence = SequenceUtil.cleanSequence(sequence);\r
 196                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 197                         return false;\r
 198                 }\r
 199                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 200                         return false;\r
 201                 }\r
 202                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 203                         return false;\r
 204                 }\r
 205                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
 206                         return false;\r
 207                 }\r
 208                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
 209                 return amb_prot.find();\r
 210         }\r
 211 \r
 212         /**\r
 213          * Writes list of FastaSequeces into the outstream formatting the sequence\r
 214          * so that it contains width chars on each line\r
 215          * \r
 216          * @param outstream\r
 217          * @param sequences\r
 218          * @param width\r
 219          *            - the maximum number of characters to write in one line\r
 220          * @throws IOException\r
 221          */\r
 222         public static void writeFasta(final OutputStream outstream,\r
 223                         final List<FastaSequence> sequences, final int width)\r
 224                         throws IOException {\r
 225                 writeFastaKeepTheStream(outstream, sequences, width);\r
 226                 outstream.close();\r
 227         }\r
 228 \r
 229         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
 230                         final List<FastaSequence> sequences, final int width)\r
 231                         throws IOException {\r
 232                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
 233                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
 234                 for (final FastaSequence fs : sequences) {\r
 235                         fastawriter.write(">" + fs.getId() + "\n");\r
 236                         fastawriter.write(fs.getFormatedSequence(width));\r
 237                         fastawriter.write("\n");\r
 238                 }\r
 239                 fastawriter.flush();\r
 240                 writer.flush();\r
 241         }\r
 242 \r
 243         /**\r
 244          * Reads fasta sequences from inStream into the list of FastaSequence\r
 245          * objects\r
 246          * \r
 247          * @param inStream\r
 248          *            from\r
 249          * @return list of FastaSequence objects\r
 250          * @throws IOException\r
 251          */\r
 252         public static List<FastaSequence> readFasta(final InputStream inStream)\r
 253                         throws IOException {\r
 254                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
 255 \r
 256                 final BufferedReader infasta = new BufferedReader(\r
 257                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
 258                 final Pattern pattern = Pattern.compile("//s+");\r
 259 \r
 260                 String line;\r
 261                 String sname = "", seqstr = null;\r
 262                 do {\r
 263                         line = infasta.readLine();\r
 264                         if ((line == null) || line.startsWith(">")) {\r
 265                                 if (seqstr != null) {\r
 266                                         seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
 267                                 }\r
 268                                 sname = line; // remove >\r
 269                                 seqstr = "";\r
 270                         } else {\r
 271                                 final String subseq = pattern.matcher(line).replaceAll("");\r
 272                                 seqstr += subseq;\r
 273                         }\r
 274                 } while (line != null);\r
 275 \r
 276                 infasta.close();\r
 277                 return seqs;\r
 278         }\r
 279 \r
 280         /**\r
 281          * Writes FastaSequence in the file, each sequence will take one line only\r
 282          * \r
 283          * @param os\r
 284          * @param sequences\r
 285          * @throws IOException\r
 286          */\r
 287         public static void writeFasta(final OutputStream os,\r
 288                         final List<FastaSequence> sequences) throws IOException {\r
 289                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
 290                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
 291                 for (final FastaSequence fs : sequences) {\r
 292                         fasta_out.write(fs.getOnelineFasta());\r
 293                 }\r
 294                 fasta_out.close();\r
 295                 outWriter.close();\r
 296         }\r
 297 \r
 298         public static List<AnnotatedSequence> readJRonn(final File result)\r
 299                         throws IOException, UnknownFileFormatException {\r
 300                 InputStream input = new FileInputStream(result);\r
 301                 List<AnnotatedSequence> sequences = readJRonn(input);\r
 302                 input.close();\r
 303                 return sequences;\r
 304         }\r
 305 \r
 306         /**\r
 307          * Reader for JRonn horizontal file format >Foobar M G D T T A G 0.48 0.42\r
 308          * 0.42 0.48 0.52 0.53 0.54 All values are tab delimited\r
 309          * \r
 310          * @param inStream\r
 311          * @return\r
 312          * @throws IOException\r
 313          * @throws UnknownFileFormatException\r
 314          */\r
 315         public static List<AnnotatedSequence> readJRonn(final InputStream inStream)\r
 316                         throws IOException, UnknownFileFormatException {\r
 317                 final List<AnnotatedSequence> seqs = new ArrayList<AnnotatedSequence>();\r
 318 \r
 319                 final BufferedReader infasta = new BufferedReader(\r
 320                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
 321 \r
 322                 String line;\r
 323                 String sname = "";\r
 324                 do {\r
 325                         line = infasta.readLine();\r
 326                         if (line == null || line.isEmpty()) {\r
 327                                 // skip empty lines\r
 328                                 continue;\r
 329                         }\r
 330                         if (line.startsWith(">")) {\r
 331                                 // read name\r
 332                                 sname = line.trim().substring(1);\r
 333                                 // read sequence line\r
 334                                 line = infasta.readLine();\r
 335                                 final String sequence = line.replace("\t", "");\r
 336                                 // read annotation line\r
 337                                 line = infasta.readLine();\r
 338                                 String[] annotValues = line.split("\t");\r
 339                                 float[] annotation = convertToNumber(annotValues);\r
 340                                 if (annotation.length != sequence.length()) {\r
 341                                         throw new UnknownFileFormatException(\r
 342                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
 343                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
 344                                 }\r
 345                                 seqs.add(new AnnotatedSequence(sname, sequence, annotation));\r
 346                         }\r
 347                 } while (line != null);\r
 348 \r
 349                 infasta.close();\r
 350                 return seqs;\r
 351         }\r
 352 \r
 353         private static float[] convertToNumber(String[] annotValues)\r
 354                         throws UnknownFileFormatException {\r
 355                 float[] annotation = new float[annotValues.length];\r
 356                 try {\r
 357                         for (int i = 0; i < annotation.length; i++) {\r
 358                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
 359                         }\r
 360                 } catch (NumberFormatException e) {\r
 361                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
 362                                         e.getCause());\r
 363                 }\r
 364                 return annotation;\r
 365         }\r
 366 \r
 367         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
 368                         + ">sequence_name\n "\r
 369                         + "M    V       S\n"\r
 370                         + "0.43 0.22    0.65\n"\r
 371                         + "Where first line is the sequence name,\n"\r
 372                         + "second line is the tab delimited sequence,\n"\r
 373                         + "third line contains tab delimited disorder prediction values.\n"\r
 374                         + "No lines are allowed between these three. Additionally, the number of  "\r
 375                         + "sequence residues must be equal to the number of the disorder values.";\r
 376 \r
 377         /**\r
 378          * Closes the Closable and logs the exception if any\r
 379          * \r
 380          * @param log\r
 381          * @param stream\r
 382          */\r
 383         public final static void closeSilently(java.util.logging.Logger log,\r
 384                         Closeable stream) {\r
 385                 if (stream != null) {\r
 386                         try {\r
 387                                 stream.close();\r
 388                         } catch (IOException e) {\r
 389                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
 390                         }\r
 391                 }\r
 392         }\r
 393 \r
 394         /**\r
 395          * \r
 396          * TODO complete!\r
 397          * \r
 398          * # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512 0.37094 T 0.79983\r
 399          * 0.85864 0.44331 .... # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512\r
 400          * 0.37094\r
 401          * \r
 402          * @param input\r
 403          * @return\r
 404          * @throws IOException\r
 405          * @throws UnknownFileFormatException\r
 406          */\r
 407         public static List<MultiAnnotatedSequence<DisemblResultAnnot>> readDisembl(\r
 408                         final InputStream input) throws IOException,\r
 409                         UnknownFileFormatException {\r
 410                 Scanner scan = new Scanner(input);\r
 411                 scan.useDelimiter("# RESIDUE COILS REM465 HOTLOOPS\n");\r
 412                 if (!scan.hasNext()) {\r
 413                         throw new UnknownFileFormatException(\r
 414                                         "In Disembl score format each seqeunce score is expected to start from the line: "\r
 415                                                         + "'# RESIDUE COILS REM465 HOTLOOPS\\n'."\r
 416                                                         + " No such line was found!");\r
 417                 }\r
 418 \r
 419                 List<MultiAnnotatedSequence<DisemblResultAnnot>> results = new ArrayList<MultiAnnotatedSequence<DisemblResultAnnot>>();\r
 420                 int seqCounter = 0;\r
 421                 while (scan.hasNext()) {\r
 422                         seqCounter++;\r
 423                         String singleSeq = scan.next();\r
 424                         Scanner scansingle = new Scanner(singleSeq);\r
 425                         StringBuffer seqbuffer = new StringBuffer();\r
 426                         ArrayList<Float> coils = new ArrayList<Float>();\r
 427                         ArrayList<Float> rem = new ArrayList<Float>();\r
 428                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
 429 \r
 430                         MultiAnnotatedSequence<DisemblResultAnnot> disemblRes = new MultiAnnotatedSequence<DisemblResultAnnot>(\r
 431                                         DisemblResultAnnot.class);\r
 432 \r
 433                         while (scansingle.hasNextLine()) {\r
 434                                 String valueLine = scansingle.nextLine();\r
 435                                 Scanner values = new Scanner(valueLine);\r
 436                                 seqbuffer.append(values.next());\r
 437                                 coils.add(values.nextFloat());\r
 438                                 rem.add(values.nextFloat());\r
 439                                 hotloops.add(values.nextFloat());\r
 440                                 values.close();\r
 441                         }\r
 442                         disemblRes.addAnnotation(DisemblResultAnnot.COILS, coils);\r
 443                         disemblRes.addAnnotation(DisemblResultAnnot.REM465, rem);\r
 444                         disemblRes.addAnnotation(DisemblResultAnnot.HOTLOOPS, hotloops);\r
 445                         // TODO\r
 446                         // disemblRes.sequence = seqbuffer.toString();\r
 447                         scansingle.close();\r
 448                         results.add(disemblRes);\r
 449                 }\r
 450 \r
 451                 input.close();\r
 452                 return results;\r
 453         }\r
 454 \r
 455         /**\r
 456          * Read AACon result with no alignment files. This method leaves incoming\r
 457          * the InputStream results open!\r
 458          * \r
 459          * @param results\r
 460          *            output file of AAConservation\r
 461          * @return {@link MultiAnnotatedSequence}\r
 462          */\r
 463         public static MultiAnnotatedSequence<Method> readResults(InputStream results) {\r
 464                 if (results == null) {\r
 465                         throw new NullPointerException(\r
 466                                         "InputStream with results must be provided");\r
 467                 }\r
 468                 MultiAnnotatedSequence<Method> annotations = new MultiAnnotatedSequence<Method>(\r
 469                                 Method.class);\r
 470                 Scanner sc = new Scanner(results);\r
 471                 sc.useDelimiter("#");\r
 472                 while (sc.hasNext()) {\r
 473                         String line = sc.next();\r
 474                         int spacePos = line.indexOf(" ");\r
 475                         assert spacePos > 0 : "Space is expected as delimited between method "\r
 476                                         + "name and values!";\r
 477                         String methodLine = line.substring(0, spacePos);\r
 478                         Method method = Method.getMethod(methodLine);\r
 479                         assert method != null : "Method " + methodLine\r
 480                                         + " is not recognized! ";\r
 481                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
 482                         ArrayList<Float> values = new ArrayList<Float>();\r
 483                         while (valuesScanner.hasNextDouble()) {\r
 484                                 Double value = valuesScanner.nextDouble();\r
 485                                 values.add(value.floatValue());\r
 486                         }\r
 487                         annotations.addAnnotation(method, values);\r
 488                 }\r
 489                 return annotations;\r
 490         }\r
 491 \r
 492 }\r