datamodel/compbio/data/sequence/SequenceUtil.java

   1 /* Copyright (c) 2011 Peter Troshin\r
   2  * Copyright (c) 2013 Alexander Sherstnev\r
   3  *  \r
   4  *  JAva Bioinformatics Analysis Web Services (JABAWS)\r
   5  *  @version: 2.5     \r
   6  * \r
   7  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
   8  *  Apache License version 2 as published by the Apache Software Foundation\r
   9  * \r
  10  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
  11  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
  12  *  License for more details.\r
  13  * \r
  14  *  A copy of the license is in apache_license.txt. It is also available here:\r
  15  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
  16  * \r
  17  * Any republication or derived work distributed in source code form\r
  18  * must include this copyright and license notice.\r
  19  */\r
  20 \r
  21 package compbio.data.sequence;\r
  22 \r
  23 import java.io.BufferedReader;\r
  24 import java.io.BufferedWriter;\r
  25 import java.io.Closeable;\r
  26 import java.io.File;\r
  27 import java.io.FileInputStream;\r
  28 import java.io.FileNotFoundException;\r
  29 import java.io.IOException;\r
  30 import java.io.InputStream;\r
  31 import java.io.InputStreamReader;\r
  32 import java.io.OutputStream;\r
  33 import java.io.OutputStreamWriter;\r
  34 import java.util.ArrayList;\r
  35 import java.util.Arrays;\r
  36 import java.util.HashMap;\r
  37 import java.util.Collections;\r
  38 import java.util.HashSet;\r
  39 import java.util.List;\r
  40 import java.util.Map;\r
  41 import java.util.Scanner;\r
  42 import java.util.Set;\r
  43 import java.util.TreeMap;\r
  44 import java.util.TreeSet;\r
  45 import java.util.logging.Level;\r
  46 import java.util.regex.Matcher;\r
  47 import java.util.regex.Pattern;\r
  48 \r
  49 import compbio.util.Util;\r
  50 \r
  51 /**\r
  52  * Utility class for operations on sequences\r
  53  * \r
  54  * @author Peter Troshin\r
  55  * @since 1.0\r
  56  * @version 2.0 June 2011\r
  57  */\r
  58 public final class SequenceUtil {\r
  59 \r
  60         /**\r
  61          * A whitespace character: [\t\n\x0B\f\r]\r
  62          */\r
  63         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
  64 \r
  65         /**\r
  66          * A digit\r
  67          */\r
  68         public static final Pattern DIGIT = Pattern.compile("\\d");\r
  69 \r
  70         /**\r
  71          * Non word\r
  72          */\r
  73         public static final Pattern NONWORD = Pattern.compile("\\W");\r
  74 \r
  75         /**\r
  76          * Valid Amino acids\r
  77          */\r
  78         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
  79                         Pattern.CASE_INSENSITIVE);\r
  80 \r
  81         /**\r
  82          * inversion of AA pattern\r
  83          */\r
  84         public static final Pattern NON_AA = Pattern.compile(\r
  85                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
  86 \r
  87         /**\r
  88          * Same as AA pattern but with two additional letters - XU\r
  89          */\r
  90         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
  91                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
  92 \r
  93         /**\r
  94          * Nucleotides a, t, g, c, u\r
  95          */\r
  96         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
  97                         Pattern.CASE_INSENSITIVE);\r
  98 \r
  99         /**\r
 100          * Ambiguous nucleotide\r
 101          */\r
 102         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
 103                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
 104         /**\r
 105          * Non nucleotide\r
 106          */\r
 107         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
 108                         Pattern.CASE_INSENSITIVE);\r
 109 \r
 110         private SequenceUtil() {\r
 111         } // utility class, no instantiation\r
 112 \r
 113         /**\r
 114          * @return true is the sequence contains only letters a,c, t, g, u\r
 115          */\r
 116         public static boolean isNucleotideSequence(final FastaSequence s) {\r
 117                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
 118         }\r
 119 \r
 120         /**\r
 121          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
 122          * (!) - B char\r
 123          */\r
 124         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
 125                 sequence = SequenceUtil.cleanSequence(sequence);\r
 126                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 127                         return false;\r
 128                 }\r
 129                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
 130                         return false;\r
 131                         /*\r
 132                          * System.out.format("I found the text starting at " +\r
 133                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
 134                          * nonDNAmatcher.end());\r
 135                          */\r
 136                 }\r
 137                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
 138                 return DNAmatcher.find();\r
 139         }\r
 140 \r
 141         /**\r
 142          * Removes all whitespace chars in the sequence string\r
 143          * \r
 144          * @param sequence\r
 145          * @return cleaned up sequence\r
 146          */\r
 147         public static String cleanSequence(String sequence) {\r
 148                 assert sequence != null;\r
 149                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
 150                 sequence = m.replaceAll("").toUpperCase();\r
 151                 return sequence;\r
 152         }\r
 153 \r
 154         /**\r
 155          * Removes all special characters and digits as well as whitespace chars\r
 156          * from the sequence\r
 157          * \r
 158          * @param sequence\r
 159          * @return cleaned up sequence\r
 160          */\r
 161         public static String deepCleanSequence(String sequence) {\r
 162                 sequence = SequenceUtil.cleanSequence(sequence);\r
 163                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
 164                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
 165                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
 166                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
 167                 return sequence;\r
 168         }\r
 169 \r
 170         /**\r
 171          * Remove all non AA chars from the sequence\r
 172          * \r
 173          * @param sequence\r
 174          *            the sequence to clean\r
 175          * @return cleaned sequence\r
 176          */\r
 177         public static String cleanProteinSequence(String sequence) {\r
 178                 return SequenceUtil.NON_AA.matcher(sequence).replaceAll("");\r
 179         }\r
 180 \r
 181         /**\r
 182          * @param sequence\r
 183          * @return true is the sequence is a protein sequence, false overwise\r
 184          */\r
 185         public static boolean isProteinSequence(String sequence) {\r
 186                 sequence = SequenceUtil.cleanSequence(sequence);\r
 187                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 188                         return false;\r
 189                 }\r
 190                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 191                         return false;\r
 192                 }\r
 193                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 194                         return false;\r
 195                 }\r
 196                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
 197                 return protmatcher.find();\r
 198         }\r
 199 \r
 200         /**\r
 201          * Check whether the sequence confirms to amboguous protein sequence\r
 202          * \r
 203          * @param sequence\r
 204          * @return return true only if the sequence if ambiguous protein sequence\r
 205          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
 206          *         protein or DNA\r
 207          */\r
 208         public static boolean isAmbiguosProtein(String sequence) {\r
 209                 sequence = SequenceUtil.cleanSequence(sequence);\r
 210                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 211                         return false;\r
 212                 }\r
 213                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 214                         return false;\r
 215                 }\r
 216                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 217                         return false;\r
 218                 }\r
 219                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
 220                         return false;\r
 221                 }\r
 222                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
 223                 return amb_prot.find();\r
 224         }\r
 225 \r
 226         /**\r
 227          * Writes list of FastaSequeces into the outstream formatting the sequence\r
 228          * so that it contains width chars on each line\r
 229          * \r
 230          * @param outstream\r
 231          * @param sequences\r
 232          * @param width\r
 233          *            - the maximum number of characters to write in one line\r
 234          * @throws IOException\r
 235          */\r
 236         public static void writeFasta(final OutputStream outstream,\r
 237                         final List<FastaSequence> sequences, final int width)\r
 238                         throws IOException {\r
 239                 writeFastaKeepTheStream(outstream, sequences, width);\r
 240                 outstream.close();\r
 241         }\r
 242 \r
 243         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
 244                         final List<FastaSequence> sequences, final int width)\r
 245                         throws IOException {\r
 246                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
 247                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
 248                 for (final FastaSequence fs : sequences) {\r
 249                         fastawriter.write(">" + fs.getId() + "\n");\r
 250                         fastawriter.write(fs.getFormatedSequence(width));\r
 251                         fastawriter.write("\n");\r
 252                 }\r
 253                 fastawriter.flush();\r
 254                 writer.flush();\r
 255         }\r
 256 \r
 257         /**\r
 258          * Reads fasta sequences from inStream into the list of FastaSequence\r
 259          * objects\r
 260          * \r
 261          * @param inStream\r
 262          *            from\r
 263          * @return list of FastaSequence objects\r
 264          * @throws IOException\r
 265          */\r
 266         public static List<FastaSequence> readFasta(final InputStream inStream)\r
 267                         throws IOException {\r
 268                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
 269                 FastaReader reader = new FastaReader(inStream);\r
 270                 while (reader.hasNext()) {\r
 271                         seqs.add(reader.next());\r
 272                 }\r
 273                 inStream.close();\r
 274                 return seqs;\r
 275         }\r
 276 \r
 277         /**\r
 278          * Writes FastaSequence in the file, each sequence will take one line only\r
 279          * \r
 280          * @param os\r
 281          * @param sequences\r
 282          * @throws IOException\r
 283          */\r
 284         public static void writeFasta(final OutputStream os,\r
 285                         final List<FastaSequence> sequences) throws IOException {\r
 286                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
 287                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
 288                 for (final FastaSequence fs : sequences) {\r
 289                         fasta_out.write(fs.getOnelineFasta());\r
 290                 }\r
 291                 fasta_out.close();\r
 292                 outWriter.close();\r
 293         }\r
 294 \r
 295         \r
 296         public static final List<FastaSequence> readJpredFile(InputStream result)\r
 297                         throws IOException, FileNotFoundException,NullPointerException {\r
 298                 return readFasta (result);\r
 299         }\r
 300         \r
 301         /**\r
 302          * Read IUPred output\r
 303          * \r
 304          * @param result\r
 305          * @return Map key->sequence name, value->Score\r
 306          * @throws IOException\r
 307          * @throws UnknownFileFormatException\r
 308          */\r
 309         public static Map<String, Score> readIUPred(final File result)\r
 310                         throws IOException, UnknownFileFormatException {\r
 311                 InputStream input = new FileInputStream(result);\r
 312                 Map<String, Score> sequences = readIUPred(input,\r
 313                                 IUPredResult.getType(result));\r
 314                 input.close(); \r
 315                 return sequences;\r
 316         }\r
 317 \r
 318         // Check the type of the file e.g. long| short or domain\r
 319         // and read\r
 320         /**\r
 321          * ## Long Disorder\r
 322          * \r
 323          * # P53_HUMAN\r
 324          * \r
 325          * 1 M 0.9943\r
 326          * \r
 327          * 2 E 0.9917\r
 328          * \r
 329          * 3 E 0.9879\r
 330          * \r
 331          * (every line)\r
 332          * \r
 333          * @throws IOException\r
 334          * @throws UnknownFileFormatException\r
 335          * \r
 336          * \r
 337          */\r
 338         private static Map<String, Score> readIUPred(InputStream input,\r
 339                         IUPredResult type) throws IOException, UnknownFileFormatException {\r
 340 \r
 341                 Score score = null;\r
 342                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
 343                 Scanner scan = new Scanner(input);\r
 344                 scan.useDelimiter("#");\r
 345                 while (scan.hasNext()) {\r
 346                         String nextEntry = scan.next();\r
 347                         Scanner entry = new Scanner(nextEntry);\r
 348                         String name = entry.nextLine().trim();\r
 349                         // inside entry:\r
 350                         if (IUPredResult.Glob == type) {\r
 351                                 // parse domains\r
 352                                 TreeSet<Range> ranges = parseIUPredDomains(entry);\r
 353                                 score = new Score(type, ranges);\r
 354                         } else {\r
 355                                 // parse short | long\r
 356                                 float[] scores = parseIUPredScores(entry);\r
 357                                 score = new Score(type, scores);\r
 358                         }\r
 359                         entry.close();\r
 360                         seqs.put(name, score);\r
 361                 }\r
 362 \r
 363                 scan.close();\r
 364                 return seqs;\r
 365         }\r
 366 \r
 367         /**\r
 368          * # P53_HUMA\r
 369          * \r
 370          * Number of globular domains: 2\r
 371          * \r
 372          * globular domain 1. 98 - 269\r
 373          * \r
 374          * globular domain 2. 431 - 482\r
 375          * \r
 376          * >P53_HUMA\r
 377          * \r
 378          * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp\r
 379          * \r
 380          * @param scan\r
 381          */\r
 382         private static TreeSet<Range> parseIUPredDomains(Scanner scan) {\r
 383                 String header = "Number of globular domains:";\r
 384                 String domainPref = "globular domain";\r
 385                 TreeSet<Range> ranges = new TreeSet<Range>();\r
 386                 String line = scan.nextLine().trim();\r
 387                 assert line.startsWith(header);\r
 388                 line = line.substring(header.length()).trim();\r
 389                 int domainNum = Integer.parseInt(line);\r
 390                 if (domainNum == 0) {\r
 391                         return ranges;\r
 392                 }\r
 393 \r
 394                 for (int i = 0; i < domainNum; i++) {\r
 395                         assert scan.hasNextLine();\r
 396                         line = scan.nextLine();\r
 397                         assert line.trim().startsWith(domainPref);\r
 398                         line = line.substring(line.indexOf(".") + 1).trim();\r
 399                         Range r = new Range(line.split("-"));\r
 400                         ranges.add(r);\r
 401                 }\r
 402 \r
 403                 return ranges;\r
 404         }\r
 405         /*\r
 406          * 1 M 0.9943\r
 407          * \r
 408          * 2 E 0.9917\r
 409          */\r
 410         private static float[] parseIUPredScores(Scanner scan)\r
 411                         throws UnknownFileFormatException {\r
 412                 List<String> annotation = new ArrayList<String>();\r
 413                 while (scan.hasNextLine()) {\r
 414                         String line = scan.nextLine().trim();\r
 415                         String[] val = line.split("\\s+");\r
 416                         annotation.add(val[2]);\r
 417                 }\r
 418                 return convertToNumber(annotation\r
 419                                 .toArray(new String[annotation.size()]));\r
 420         }\r
 421 \r
 422         public static Map<String, Score> readJRonn(final File result)\r
 423                         throws IOException, UnknownFileFormatException {\r
 424                 InputStream input = new FileInputStream(result);\r
 425                 Map<String, Score> sequences = readJRonn(input);\r
 426                 input.close();\r
 427                 return sequences;\r
 428         }\r
 429 \r
 430         /**\r
 431          * Reader for JRonn horizontal file format\r
 432          * \r
 433          * <pre>\r
 434          * &gtFoobar M G D T T A G 0.48 0.42\r
 435          * 0.42 0.48 0.52 0.53 0.54\r
 436          * \r
 437          * <pre>\r
 438          * Where all values are tab delimited\r
 439          * \r
 440          * @param inStream\r
 441          *            the InputStream connected to the JRonn output file\r
 442          * @return Map key=sequence name value=Score\r
 443          * @throws IOException\r
 444          *             is thrown if the inStream has problems accessing the data\r
 445          * @throws UnknownFileFormatException\r
 446          *             is thrown if the inStream represents an unknown source of\r
 447          * data, i.e. not a JRonn output\r
 448          */\r
 449         public static Map<String, Score> readJRonn(final InputStream inStream)\r
 450                         throws IOException, UnknownFileFormatException {\r
 451                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
 452 \r
 453                 final BufferedReader infasta = new BufferedReader(\r
 454                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
 455 \r
 456                 String line;\r
 457                 String sname = "";\r
 458                 do {\r
 459                         line = infasta.readLine();\r
 460                         if (line == null || line.isEmpty()) {\r
 461                                 // skip empty lines\r
 462                                 continue;\r
 463                         }\r
 464                         if (line.startsWith(">")) {\r
 465                                 // read name\r
 466                                 sname = line.trim().substring(1);\r
 467                                 // read sequence line\r
 468                                 line = infasta.readLine();\r
 469                                 final String sequence = line.replace("\t", "");\r
 470                                 // read annotation line\r
 471                                 line = infasta.readLine();\r
 472                                 String[] annotValues = line.split("\t");\r
 473                                 float[] annotation = convertToNumber(annotValues);\r
 474                                 if (annotation.length != sequence.length()) {\r
 475                                         throw new UnknownFileFormatException(\r
 476                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
 477                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
 478                                 }\r
 479                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
 480                         }\r
 481                 } while (line != null);\r
 482 \r
 483                 infasta.close();\r
 484                 return seqs;\r
 485         }\r
 486 \r
 487         private static float[] convertToNumber(String[] annotValues)\r
 488                         throws UnknownFileFormatException {\r
 489                 float[] annotation = new float[annotValues.length];\r
 490                 try {\r
 491                         for (int i = 0; i < annotation.length; i++) {\r
 492                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
 493                         }\r
 494                 } catch (NumberFormatException e) {\r
 495                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
 496                                         e.getCause());\r
 497                 }\r
 498                 return annotation;\r
 499         }\r
 500 \r
 501         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
 502                         + ">sequence_name\n "\r
 503                         + "M    V       S\n"\r
 504                         + "0.43 0.22    0.65\n"\r
 505                         + "Where first line is the sequence name,\n"\r
 506                         + "second line is the tab delimited sequence,\n"\r
 507                         + "third line contains tab delimited disorder prediction values.\n"\r
 508                         + "No lines are allowed between these three. Additionally, the number of  "\r
 509                         + "sequence residues must be equal to the number of the disorder values.";\r
 510 \r
 511         /**\r
 512          * Closes the Closable and logs the exception if any\r
 513          * \r
 514          * @param log\r
 515          * @param stream\r
 516          */\r
 517         public final static void closeSilently(java.util.logging.Logger log,\r
 518                         Closeable stream) {\r
 519                 if (stream != null) {\r
 520                         try {\r
 521                                 stream.close();\r
 522                         } catch (IOException e) {\r
 523                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
 524                         }\r
 525                 }\r
 526         }\r
 527 \r
 528         /**\r
 529          * \r
 530          > Foobar_dundeefriends\r
 531          * \r
 532          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
 533          * \r
 534          * # REM465 355-368\r
 535          * \r
 536          * # HOTLOOPS 190-204\r
 537          * \r
 538          * # RESIDUE COILS REM465 HOTLOOPS\r
 539          * \r
 540          * M 0.86010 0.88512 0.37094\r
 541          * \r
 542          * T 0.79983 0.85864 0.44331\r
 543          * \r
 544          * >Next Sequence name\r
 545          * \r
 546          * \r
 547          * @param input\r
 548          *            the InputStream\r
 549          * @return Map key=sequence name, value=set of score\r
 550          * @throws IOException\r
 551          * @throws UnknownFileFormatException\r
 552          */\r
 553         public static HashMap<String, Set<Score>> readDisembl(\r
 554                         final InputStream input) throws IOException,\r
 555                         UnknownFileFormatException {\r
 556                 Scanner scan = new Scanner(input);\r
 557                 scan.useDelimiter(">");\r
 558                 if (!scan.hasNext()) {\r
 559                         throw new UnknownFileFormatException(\r
 560                                         "In Disembl score format each sequence score is expected "\r
 561                                                         + "to start from the line: >Sequence name "\r
 562                                                         + " No such line was found!");\r
 563                 }\r
 564 \r
 565                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
 566                 int seqCounter = 0;\r
 567                 while (scan.hasNext()) {\r
 568                         seqCounter++;\r
 569                         String singleSeq = scan.next();\r
 570                         Scanner scansingle = new Scanner(singleSeq);\r
 571                         if (!scansingle.hasNextLine()) {\r
 572                                 throw new RuntimeException(\r
 573                                                 "The input looks like an incomplete disembl file - cannot parse!");\r
 574                         }\r
 575 \r
 576                         StringBuffer seqbuffer = new StringBuffer();\r
 577                         ArrayList<Float> coils = new ArrayList<Float>();\r
 578                         ArrayList<Float> rem = new ArrayList<Float>();\r
 579                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
 580 \r
 581                         String sequenceName = scansingle.nextLine().trim();\r
 582                         TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
 583                                         scansingle.nextLine());\r
 584                         TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
 585                                         scansingle.nextLine());\r
 586                         TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
 587                                         scansingle.nextLine());\r
 588 \r
 589                         String title = scansingle.nextLine();\r
 590                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
 591 \r
 592                         while (scansingle.hasNext()) {\r
 593                                 seqbuffer.append(scansingle.next());\r
 594                                 coils.add(scansingle.nextFloat());\r
 595                                 rem.add(scansingle.nextFloat());\r
 596                                 hotloops.add(scansingle.nextFloat());\r
 597                         }\r
 598                         /*\r
 599                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
 600                          * seqbuffer.toString());\r
 601                          */\r
 602                         HashSet<Score> scores = new HashSet<Score>();\r
 603                         scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
 604                         scores.add(new Score(DisemblResult.REM465, rem, rem465R));\r
 605                         scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, loopsR));\r
 606                         results.put(sequenceName, scores);\r
 607 \r
 608                         scansingle.close();\r
 609                 }\r
 610                 scan.close();\r
 611                 input.close();\r
 612                 return results;\r
 613         }\r
 614 \r
 615         /**\r
 616          * Parsing:\r
 617          * \r
 618          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
 619          * 350-391, 429-485, 497-506, 539-547\r
 620          * \r
 621          * # REM465 355-368\r
 622          * \r
 623          * # HOTLOOPS 190-204\r
 624          * \r
 625          * @param lines\r
 626          * @return\r
 627          */\r
 628         private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
 629                 TreeSet<Range> ranges = new TreeSet<Range>();\r
 630 \r
 631                 Scanner scan = new Scanner(lines);\r
 632 \r
 633                 assert scan.hasNext();\r
 634                 String del = scan.next();\r
 635                 assert "#".equals(del); // pass delimiter #\r
 636                 String type = scan.next(); // pass enum name e.g. COILS\r
 637                 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
 638                                 + resultType.toString();\r
 639 \r
 640                 // beginning of the ranges\r
 641                 scan.useDelimiter(",");\r
 642                 while (scan.hasNext()) {\r
 643                         String range = scan.next();\r
 644                         if (!Util.isEmpty(range)) {\r
 645                                 ranges.add(new Range(range.split("-")));\r
 646                         }\r
 647                 }\r
 648                 return ranges;\r
 649         }\r
 650 \r
 651         /**\r
 652          * \r
 653          > Foobar_dundeefriends\r
 654          * \r
 655          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
 656          * \r
 657          * # REM465 355-368\r
 658          * \r
 659          * # HOTLOOPS 190-204\r
 660          * \r
 661          * # RESIDUE COILS REM465 HOTLOOPS\r
 662          * \r
 663          * M 0.86010 0.88512 0.37094\r
 664          * \r
 665          * T 0.79983 0.85864 0.44331\r
 666          * \r
 667          * >Next Sequence name\r
 668          * \r
 669          * \r
 670          * @param input\r
 671          * @return Map key=sequence name, value=set of score\r
 672          * @throws IOException\r
 673          * @throws UnknownFileFormatException\r
 674          */\r
 675         public static HashMap<String, Set<Score>> readGlobPlot(\r
 676                         final InputStream input) throws IOException,\r
 677                         UnknownFileFormatException {\r
 678                 Scanner scan = new Scanner(input);\r
 679                 scan.useDelimiter(">");\r
 680                 if (!scan.hasNext()) {\r
 681                         throw new UnknownFileFormatException(\r
 682                                         "In GlobPlot score format each sequence score is expected "\r
 683                                                         + "to start from the line: >Sequence name "\r
 684                                                         + " No such line was found!");\r
 685                 }\r
 686 \r
 687                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
 688                 int seqCounter = 0;\r
 689                 while (scan.hasNext()) {\r
 690                         seqCounter++;\r
 691                         String singleSeq = scan.next();\r
 692                         Scanner scansingle = new Scanner(singleSeq);\r
 693                         if (!scansingle.hasNextLine()) {\r
 694                                 throw new RuntimeException(\r
 695                                                 "The input looks like an incomplete GlobPlot file - cannot parse!");\r
 696                         }\r
 697 \r
 698                         StringBuffer seqbuffer = new StringBuffer();\r
 699                         ArrayList<Float> dydxScore = new ArrayList<Float>();\r
 700                         ArrayList<Float> rawScore = new ArrayList<Float>();\r
 701                         ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
 702 \r
 703                         String sequenceName = scansingle.nextLine().trim();\r
 704                         TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
 705                                         scansingle.nextLine());\r
 706                         TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
 707                                         scansingle.nextLine());\r
 708 \r
 709                         String title = scansingle.nextLine();\r
 710                         assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
 711 \r
 712                         while (scansingle.hasNext()) {\r
 713                                 seqbuffer.append(scansingle.next());\r
 714                                 dydxScore.add(scansingle.nextFloat());\r
 715                                 rawScore.add(scansingle.nextFloat());\r
 716                                 smoothedScore.add(scansingle.nextFloat());\r
 717                         }\r
 718                         /*\r
 719                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
 720                          * seqbuffer.toString());\r
 721                          */\r
 722                         Set<Score> scores = new TreeSet<Score>();\r
 723                         scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
 724                         scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
 725                         scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
 726                         scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
 727                         scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
 728                         results.put(sequenceName, scores);\r
 729 \r
 730                         scansingle.close();\r
 731                 }\r
 732                 scan.close();\r
 733                 input.close();\r
 734                 return results;\r
 735         }\r
 736         /**\r
 737          * Read AACon result with no alignment files. This method leaves incoming\r
 738          * InputStream open!\r
 739          * \r
 740          * @param results\r
 741          *            output file of AAConservation\r
 742          * @return Map with keys {@link ConservationMethod} -> float[]\r
 743          */\r
 744         public static HashSet<Score> readAAConResults(InputStream results) {\r
 745                 if (results == null) {\r
 746                         throw new NullPointerException(\r
 747                                         "InputStream with results must be provided");\r
 748                 }\r
 749                 HashSet<Score> annotations = new HashSet<Score>();\r
 750                 Scanner sc = new Scanner(results);\r
 751                 sc.useDelimiter("#");\r
 752                 while (sc.hasNext()) {\r
 753                         String line = sc.next();\r
 754                         int spacePos = line.indexOf(" ");\r
 755                         assert spacePos > 0 : "Space is expected as delimited between method "\r
 756                                         + "name and values!";\r
 757                         String methodLine = line.substring(0, spacePos);\r
 758                         ConservationMethod method = ConservationMethod\r
 759                                         .getMethod(methodLine);\r
 760                         assert method != null : "Method " + methodLine\r
 761                                         + " is not recognized! ";\r
 762                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
 763                         ArrayList<Float> values = new ArrayList<Float>();\r
 764                         while (valuesScanner.hasNextDouble()) {\r
 765                                 Double value = valuesScanner.nextDouble();\r
 766                                 values.add(value.floatValue());\r
 767                         }\r
 768                         annotations.add(new Score(method, values));\r
 769                 }\r
 770                 return annotations;\r
 771         }\r
 772 \r
 773         \r
 774         \r
 775 \r
 776         /**\r
 777          * Reads and parses Fasta or Clustal formatted file into a list of\r
 778          * FastaSequence objects\r
 779          * \r
 780          * @param inFilePath\r
 781          *            the path to the input file\r
 782          * @throws IOException\r
 783          *             if the file denoted by inFilePath cannot be read\r
 784          * @throws UnknownFileFormatException\r
 785          *             if the inFilePath points to the file which format cannot be\r
 786          *             recognised\r
 787          * @return the List of FastaSequence objects\r
 788          * \r
 789          */\r
 790         public static List<FastaSequence> openInputStream(String inFilePath)\r
 791                         throws IOException, UnknownFileFormatException {\r
 792 \r
 793                 // This stream gets closed in isValidClustalFile method\r
 794                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
 795                 // This stream is closed in the calling methods\r
 796                 InputStream inStr = new FileInputStream(inFilePath);\r
 797                 List<FastaSequence> fastaSeqs = null;\r
 798                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
 799                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
 800                         // alignment cannot be null see\r
 801                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
 802                         fastaSeqs = al.getSequences();\r
 803                 } else {\r
 804                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
 805                 }\r
 806                 return fastaSeqs;\r
 807         }\r
 808 \r
 809         // This can't possibly be right for all cases!\r
 810         // but it will do for now\r
 811         \r
 812         // As for the metadata. This function doesnt know what program\r
 813         // generated it. How to handle the metadata!?\r
 814         \r
 815         public static void writeClustal(OutputStream outStream,\r
 816                         List<FastaSequence> sequences, char gapChar) \r
 817                         throws IOException {\r
 818                 \r
 819                 BufferedWriter writer = new BufferedWriter(\r
 820                                 new OutputStreamWriter(outStream));\r
 821                 // will give AlignmentMetadata default type of CLUSTAL for now\r
 822                 AlignmentMetadata al = new AlignmentMetadata(Program.CLUSTAL, gapChar);\r
 823                 \r
 824                 ClustalAlignmentUtil.writeClustalAlignment(writer, \r
 825                                 new Alignment(sequences, al));\r
 826                 \r
 827         }\r
 828 \r
 829 }\r
 830 \r
 831 enum DisemblResult {\r
 832         /** These contains ranges and scores */\r
 833         COILS, REM465, HOTLOOPS\r
 834 }\r
 835 enum GlobProtResult {\r
 836         /** This a range with no scores */\r
 837         GlobDoms,\r
 838         /** This a range with no scores */\r
 839         Disorder,\r
 840         /** This a score with no range */\r
 841         Dydx,\r
 842         /** This a score with no range */\r
 843         SmoothedScore,\r
 844         /** This a score with no range */\r
 845         RawScore\r
 846 }\r
 847 \r
 848 enum IUPredResult {\r
 849         /**\r
 850          * Short disorder\r
 851          */\r
 852         Short,\r
 853         /**\r
 854          * Long disorder\r
 855          */\r
 856         Long,\r
 857         /**\r
 858          * Globular domains\r
 859          */\r
 860         Glob;\r
 861 \r
 862         static IUPredResult getType(File file) {\r
 863                 assert file != null;\r
 864                 String name = file.getName();\r
 865                 if (name.endsWith(Long.toString().toLowerCase())) {\r
 866                         return Long;\r
 867                 }\r
 868                 if (name.endsWith(Short.toString().toLowerCase())) {\r
 869                         return Short;\r
 870                 }\r
 871                 if (name.endsWith(Glob.toString().toLowerCase())) {\r
 872                         return Glob;\r
 873                 }\r
 874                 throw new AssertionError(\r
 875                                 "IUPred result file type cannot be recognised! "\r
 876                                                 + "\nFile must ends with one of [glob, long or short]"\r
 877                                                 + "\n but given file name was: " + file.getName());\r
 878         }\r
 879 }\r