datamodel/compbio/data/sequence/SequenceUtil.java

   1 /* Copyright (c) 2011 Peter Troshin\r
   2  *  \r
   3  *  JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0     \r
   4  * \r
   5  *  This library is free software; you can redistribute it and/or modify it under the terms of the\r
   6  *  Apache License version 2 as published by the Apache Software Foundation\r
   7  * \r
   8  *  This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without\r
   9  *  even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache \r
  10  *  License for more details.\r
  11  * \r
  12  *  A copy of the license is in apache_license.txt. It is also available here:\r
  13  * @see: http://www.apache.org/licenses/LICENSE-2.0.txt\r
  14  * \r
  15  * Any republication or derived work distributed in source code form\r
  16  * must include this copyright and license notice.\r
  17  */\r
  18 \r
  19 package compbio.data.sequence;\r
  20 \r
  21 import java.io.BufferedReader;\r
  22 import java.io.BufferedWriter;\r
  23 import java.io.Closeable;\r
  24 import java.io.File;\r
  25 import java.io.FileInputStream;\r
  26 import java.io.IOException;\r
  27 import java.io.InputStream;\r
  28 import java.io.InputStreamReader;\r
  29 import java.io.OutputStream;\r
  30 import java.io.OutputStreamWriter;\r
  31 import java.util.ArrayList;\r
  32 import java.util.HashMap;\r
  33 import java.util.HashSet;\r
  34 import java.util.List;\r
  35 import java.util.Map;\r
  36 import java.util.Scanner;\r
  37 import java.util.Set;\r
  38 import java.util.TreeSet;\r
  39 import java.util.logging.Level;\r
  40 import java.util.regex.Matcher;\r
  41 import java.util.regex.Pattern;\r
  42 \r
  43 import compbio.util.Util;\r
  44 \r
  45 /**\r
  46  * Utility class for operations on sequences\r
  47  * \r
  48  * @author Peter Troshin\r
  49  * @since 1.0\r
  50  * @version 2.0 June 2011\r
  51  */\r
  52 public final class SequenceUtil {\r
  53 \r
        /**
         * A single whitespace character: [ \t\n\x0B\f\r] (the space character is
         * included)
         */
        public static final Pattern WHITE_SPACE = Pattern.compile("\\s");

        /**
         * A single digit: [0-9]
         */
        public static final Pattern DIGIT = Pattern.compile("\\d");

        /**
         * A single non-word character, i.e. anything but [a-zA-Z_0-9]
         */
        public static final Pattern NONWORD = Pattern.compile("\\W");

        /**
         * One or more valid amino acid letters (the 20 standard one-letter codes),
         * compiled case-insensitively
         */
        public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",
                        Pattern.CASE_INSENSITIVE);

        /**
         * Inversion of the AA pattern: one or more characters that are not among
         * the letters listed in the AA pattern
         */
        public static final Pattern NON_AA = Pattern.compile(
                        "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);

        /**
         * Same as the AA pattern but with two additional letters - X and U
         */
        public static final Pattern AMBIGUOUS_AA = Pattern.compile(
                        "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);

        /**
         * One or more nucleotide letters: a, t, g, c, u (compiled
         * case-insensitively)
         */
        public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",
                        Pattern.CASE_INSENSITIVE);

        /**
         * One or more letters of the ambiguous nucleotide alphabet (the
         * unambiguous letters plus R, Y, M, K, S, W, H, B, V, D, N)
         */
        public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(
                        "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC
        /**
         * Inversion of the NUCLEOTIDE pattern: one or more characters that are not
         * among the letters a, g, t, c, u
         */
        public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",
                        Pattern.CASE_INSENSITIVE);
 103 \r
        /**
         * Private constructor: prevents instantiation from outside this class.
         */
        private SequenceUtil() {
        } // utility class, no instantiation
 106 \r
 107         /**\r
 108          * @return true is the sequence contains only letters a,c, t, g, u\r
 109          */\r
 110         public static boolean isNucleotideSequence(final FastaSequence s) {\r
 111                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
 112         }\r
 113 \r
 114         /**\r
 115          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
 116          * (!) - B char\r
 117          */\r
 118         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
 119                 sequence = SequenceUtil.cleanSequence(sequence);\r
 120                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 121                         return false;\r
 122                 }\r
 123                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
 124                         return false;\r
 125                         /*\r
 126                          * System.out.format("I found the text starting at " +\r
 127                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
 128                          * nonDNAmatcher.end());\r
 129                          */\r
 130                 }\r
 131                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
 132                 return DNAmatcher.find();\r
 133         }\r
 134 \r
 135         /**\r
 136          * Removes all whitespace chars in the sequence string\r
 137          * \r
 138          * @param sequence\r
 139          * @return cleaned up sequence\r
 140          */\r
 141         public static String cleanSequence(String sequence) {\r
 142                 assert sequence != null;\r
 143                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
 144                 sequence = m.replaceAll("").toUpperCase();\r
 145                 return sequence;\r
 146         }\r
 147 \r
 148         /**\r
 149          * Removes all special characters and digits as well as whitespace chars\r
 150          * from the sequence\r
 151          * \r
 152          * @param sequence\r
 153          * @return cleaned up sequence\r
 154          */\r
 155         public static String deepCleanSequence(String sequence) {\r
 156                 sequence = SequenceUtil.cleanSequence(sequence);\r
 157                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
 158                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
 159                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
 160                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
 161                 return sequence;\r
 162         }\r
 163 \r
 164         /**\r
 165          * Remove all non AA chars from the sequence\r
 166          * \r
 167          * @param sequence\r
 168          *            the sequence to clean\r
 169          * @return cleaned sequence\r
 170          */\r
 171         public static String cleanProteinSequence(String sequence) {\r
 172                 return SequenceUtil.NON_AA.matcher(sequence).replaceAll("");\r
 173         }\r
 174 \r
 175         /**\r
 176          * @param sequence\r
 177          * @return true is the sequence is a protein sequence, false overwise\r
 178          */\r
 179         public static boolean isProteinSequence(String sequence) {\r
 180                 sequence = SequenceUtil.cleanSequence(sequence);\r
 181                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 182                         return false;\r
 183                 }\r
 184                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 185                         return false;\r
 186                 }\r
 187                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 188                         return false;\r
 189                 }\r
 190                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
 191                 return protmatcher.find();\r
 192         }\r
 193 \r
 194         /**\r
 195          * Check whether the sequence confirms to amboguous protein sequence\r
 196          * \r
 197          * @param sequence\r
 198          * @return return true only if the sequence if ambiguous protein sequence\r
 199          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
 200          *         protein or DNA\r
 201          */\r
 202         public static boolean isAmbiguosProtein(String sequence) {\r
 203                 sequence = SequenceUtil.cleanSequence(sequence);\r
 204                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 205                         return false;\r
 206                 }\r
 207                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 208                         return false;\r
 209                 }\r
 210                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 211                         return false;\r
 212                 }\r
 213                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
 214                         return false;\r
 215                 }\r
 216                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
 217                 return amb_prot.find();\r
 218         }\r
 219 \r
 220         /**\r
 221          * Writes list of FastaSequeces into the outstream formatting the sequence\r
 222          * so that it contains width chars on each line\r
 223          * \r
 224          * @param outstream\r
 225          * @param sequences\r
 226          * @param width\r
 227          *            - the maximum number of characters to write in one line\r
 228          * @throws IOException\r
 229          */\r
 230         public static void writeFasta(final OutputStream outstream,\r
 231                         final List<FastaSequence> sequences, final int width)\r
 232                         throws IOException {\r
 233                 writeFastaKeepTheStream(outstream, sequences, width);\r
 234                 outstream.close();\r
 235         }\r
 236 \r
 237         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
 238                         final List<FastaSequence> sequences, final int width)\r
 239                         throws IOException {\r
 240                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
 241                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
 242                 for (final FastaSequence fs : sequences) {\r
 243                         fastawriter.write(">" + fs.getId() + "\n");\r
 244                         fastawriter.write(fs.getFormatedSequence(width));\r
 245                         fastawriter.write("\n");\r
 246                 }\r
 247                 fastawriter.flush();\r
 248                 writer.flush();\r
 249         }\r
 250 \r
 251         /**\r
 252          * Reads fasta sequences from inStream into the list of FastaSequence\r
 253          * objects\r
 254          * \r
 255          * @param inStream\r
 256          *            from\r
 257          * @return list of FastaSequence objects\r
 258          * @throws IOException\r
 259          */\r
 260         public static List<FastaSequence> readFasta(final InputStream inStream)\r
 261                         throws IOException {\r
 262                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
 263                 FastaReader reader = new FastaReader(inStream);\r
 264                 while (reader.hasNext()) {\r
 265                         seqs.add(reader.next());\r
 266                 }\r
 267                 inStream.close();\r
 268                 return seqs;\r
 269         }\r
 270 \r
 271         /**\r
 272          * Writes FastaSequence in the file, each sequence will take one line only\r
 273          * \r
 274          * @param os\r
 275          * @param sequences\r
 276          * @throws IOException\r
 277          */\r
 278         public static void writeFasta(final OutputStream os,\r
 279                         final List<FastaSequence> sequences) throws IOException {\r
 280                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
 281                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
 282                 for (final FastaSequence fs : sequences) {\r
 283                         fasta_out.write(fs.getOnelineFasta());\r
 284                 }\r
 285                 fasta_out.close();\r
 286                 outWriter.close();\r
 287         }\r
 288 \r
 289         /**\r
 290          * Read IUPred output\r
 291          * \r
 292          * @param result\r
 293          * @return Map key->sequence name, value->Score\r
 294          * @throws IOException\r
 295          * @throws UnknownFileFormatException\r
 296          */\r
 297         public static Map<String, Score> readIUPred(final File result)\r
 298                         throws IOException, UnknownFileFormatException {\r
 299                 InputStream input = new FileInputStream(result);\r
 300                 Map<String, Score> sequences = readIUPred(input,\r
 301                                 IUPredResult.getType(result));\r
 302                 input.close();\r
 303                 return sequences;\r
 304         }\r
 305 \r
 306         // Check the type of the file e.g. long| short or domain\r
 307         // and read\r
 308         /**\r
 309          * ## Long Disorder\r
 310          * \r
 311          * # P53_HUMAN\r
 312          * \r
 313          * 1 M 0.9943\r
 314          * \r
 315          * 2 E 0.9917\r
 316          * \r
 317          * 3 E 0.9879\r
 318          * \r
 319          * (every line)\r
 320          * \r
 321          * @throws IOException\r
 322          * @throws UnknownFileFormatException\r
 323          * \r
 324          * \r
 325          */\r
 326         private static Map<String, Score> readIUPred(InputStream input,\r
 327                         IUPredResult type) throws IOException, UnknownFileFormatException {\r
 328 \r
 329                 Score score = null;\r
 330                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
 331                 Scanner scan = new Scanner(input);\r
 332                 scan.useDelimiter("#");\r
 333                 while (scan.hasNext()) {\r
 334                         String nextEntry = scan.next();\r
 335                         Scanner entry = new Scanner(nextEntry);\r
 336                         String name = entry.nextLine().trim();\r
 337                         // inside entry:\r
 338                         if (IUPredResult.Glob == type) {\r
 339                                 // parse domains\r
 340                                 TreeSet<Range> ranges = parseIUPredDomains(entry);\r
 341                                 score = new Score(type, ranges);\r
 342                         } else {\r
 343                                 // parse short | long\r
 344                                 float[] scores = parseIUPredScores(entry);\r
 345                                 score = new Score(type, scores);\r
 346                         }\r
 347                         entry.close();\r
 348                         seqs.put(name, score);\r
 349                 }\r
 350 \r
 351                 scan.close();\r
 352                 return seqs;\r
 353         }\r
 354 \r
 355         /**\r
 356          * # P53_HUMA\r
 357          * \r
 358          * Number of globular domains: 2\r
 359          * \r
 360          * globular domain 1. 98 - 269\r
 361          * \r
 362          * globular domain 2. 431 - 482\r
 363          * \r
 364          * >P53_HUMA\r
 365          * \r
 366          * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp\r
 367          * \r
 368          * @param scan\r
 369          */\r
 370         private static TreeSet<Range> parseIUPredDomains(Scanner scan) {\r
 371                 String header = "Number of globular domains:";\r
 372                 String domainPref = "globular domain";\r
 373                 TreeSet<Range> ranges = new TreeSet<Range>();\r
 374                 String line = scan.nextLine().trim();\r
 375                 assert line.startsWith(header);\r
 376                 line = line.substring(header.length()).trim();\r
 377                 int domainNum = Integer.parseInt(line);\r
 378                 if (domainNum == 0) {\r
 379                         return ranges;\r
 380                 }\r
 381 \r
 382                 for (int i = 0; i < domainNum; i++) {\r
 383                         assert scan.hasNextLine();\r
 384                         line = scan.nextLine();\r
 385                         assert line.trim().startsWith(domainPref);\r
 386                         line = line.substring(line.indexOf(".") + 1).trim();\r
 387                         Range r = new Range(line.split("-"));\r
 388                         ranges.add(r);\r
 389                 }\r
 390 \r
 391                 return ranges;\r
 392         }\r
 393         /*\r
 394          * 1 M 0.9943\r
 395          * \r
 396          * 2 E 0.9917\r
 397          */\r
 398         private static float[] parseIUPredScores(Scanner scan)\r
 399                         throws UnknownFileFormatException {\r
 400                 List<String> annotation = new ArrayList<String>();\r
 401                 while (scan.hasNextLine()) {\r
 402                         String line = scan.nextLine().trim();\r
 403                         String[] val = line.split("\\s+");\r
 404                         annotation.add(val[2]);\r
 405                 }\r
 406                 return convertToNumber(annotation\r
 407                                 .toArray(new String[annotation.size()]));\r
 408         }\r
 409 \r
 410         public static Map<String, Score> readJRonn(final File result)\r
 411                         throws IOException, UnknownFileFormatException {\r
 412                 InputStream input = new FileInputStream(result);\r
 413                 Map<String, Score> sequences = readJRonn(input);\r
 414                 input.close();\r
 415                 return sequences;\r
 416         }\r
 417 \r
 418         /**\r
 419          * Reader for JRonn horizontal file format\r
 420          * \r
 421          * <pre>\r
 422          * &gtFoobar M G D T T A G 0.48 0.42\r
 423          * 0.42 0.48 0.52 0.53 0.54\r
 424          * \r
 425          * <pre>\r
 426          * Where all values are tab delimited\r
 427          * \r
 428          * @param inStream\r
 429          *            the InputStream connected to the JRonn output file\r
 430          * @return Map key=sequence name value=Score\r
 431          * @throws IOException\r
 432          *             is thrown if the inStream has problems accessing the data\r
 433          * @throws UnknownFileFormatException\r
 434          *             is thrown if the inStream represents an unknown source of\r
 435          * data, i.e. not a JRonn output\r
 436          */\r
 437         public static Map<String, Score> readJRonn(final InputStream inStream)\r
 438                         throws IOException, UnknownFileFormatException {\r
 439                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
 440 \r
 441                 final BufferedReader infasta = new BufferedReader(\r
 442                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
 443 \r
 444                 String line;\r
 445                 String sname = "";\r
 446                 do {\r
 447                         line = infasta.readLine();\r
 448                         if (line == null || line.isEmpty()) {\r
 449                                 // skip empty lines\r
 450                                 continue;\r
 451                         }\r
 452                         if (line.startsWith(">")) {\r
 453                                 // read name\r
 454                                 sname = line.trim().substring(1);\r
 455                                 // read sequence line\r
 456                                 line = infasta.readLine();\r
 457                                 final String sequence = line.replace("\t", "");\r
 458                                 // read annotation line\r
 459                                 line = infasta.readLine();\r
 460                                 String[] annotValues = line.split("\t");\r
 461                                 float[] annotation = convertToNumber(annotValues);\r
 462                                 if (annotation.length != sequence.length()) {\r
 463                                         throw new UnknownFileFormatException(\r
 464                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
 465                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
 466                                 }\r
 467                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
 468                         }\r
 469                 } while (line != null);\r
 470 \r
 471                 infasta.close();\r
 472                 return seqs;\r
 473         }\r
 474 \r
 475         private static float[] convertToNumber(String[] annotValues)\r
 476                         throws UnknownFileFormatException {\r
 477                 float[] annotation = new float[annotValues.length];\r
 478                 try {\r
 479                         for (int i = 0; i < annotation.length; i++) {\r
 480                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
 481                         }\r
 482                 } catch (NumberFormatException e) {\r
 483                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
 484                                         e.getCause());\r
 485                 }\r
 486                 return annotation;\r
 487         }\r
 488 \r
        /**
         * Description of the expected JRonn horizontal format. Used as the
         * message, or as a part of the message, of the
         * UnknownFileFormatException raised when the input cannot be parsed.
         */
        private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"
                        + ">sequence_name\n "
                        + "M    V       S\n"
                        + "0.43 0.22    0.65\n"
                        + "Where first line is the sequence name,\n"
                        + "second line is the tab delimited sequence,\n"
                        + "third line contains tab delimited disorder prediction values.\n"
                        + "No lines are allowed between these three. Additionally, the number of  "
                        + "sequence residues must be equal to the number of the disorder values.";
 498 \r
 499         /**\r
 500          * Closes the Closable and logs the exception if any\r
 501          * \r
 502          * @param log\r
 503          * @param stream\r
 504          */\r
 505         public final static void closeSilently(java.util.logging.Logger log,\r
 506                         Closeable stream) {\r
 507                 if (stream != null) {\r
 508                         try {\r
 509                                 stream.close();\r
 510                         } catch (IOException e) {\r
 511                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
 512                         }\r
 513                 }\r
 514         }\r
 515 \r
 516         /**\r
 517          * \r
 518          > Foobar_dundeefriends\r
 519          * \r
 520          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
 521          * \r
 522          * # REM465 355-368\r
 523          * \r
 524          * # HOTLOOPS 190-204\r
 525          * \r
 526          * # RESIDUE COILS REM465 HOTLOOPS\r
 527          * \r
 528          * M 0.86010 0.88512 0.37094\r
 529          * \r
 530          * T 0.79983 0.85864 0.44331\r
 531          * \r
 532          * >Next Sequence name\r
 533          * \r
 534          * \r
 535          * @param input\r
 536          *            the InputStream\r
 537          * @return Map key=sequence name, value=set of score\r
 538          * @throws IOException\r
 539          * @throws UnknownFileFormatException\r
 540          */\r
 541         public static HashMap<String, Set<Score>> readDisembl(\r
 542                         final InputStream input) throws IOException,\r
 543                         UnknownFileFormatException {\r
 544                 Scanner scan = new Scanner(input);\r
 545                 scan.useDelimiter(">");\r
 546                 if (!scan.hasNext()) {\r
 547                         throw new UnknownFileFormatException(\r
 548                                         "In Disembl score format each sequence score is expected "\r
 549                                                         + "to start from the line: >Sequence name "\r
 550                                                         + " No such line was found!");\r
 551                 }\r
 552 \r
 553                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
 554                 int seqCounter = 0;\r
 555                 while (scan.hasNext()) {\r
 556                         seqCounter++;\r
 557                         String singleSeq = scan.next();\r
 558                         Scanner scansingle = new Scanner(singleSeq);\r
 559                         if (!scansingle.hasNextLine()) {\r
 560                                 throw new RuntimeException(\r
 561                                                 "The input looks like an incomplete disembl file - cannot parse!");\r
 562                         }\r
 563 \r
 564                         StringBuffer seqbuffer = new StringBuffer();\r
 565                         ArrayList<Float> coils = new ArrayList<Float>();\r
 566                         ArrayList<Float> rem = new ArrayList<Float>();\r
 567                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
 568 \r
 569                         String sequenceName = scansingle.nextLine().trim();\r
 570                         TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
 571                                         scansingle.nextLine());\r
 572                         TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
 573                                         scansingle.nextLine());\r
 574                         TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
 575                                         scansingle.nextLine());\r
 576 \r
 577                         String title = scansingle.nextLine();\r
 578                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
 579 \r
 580                         while (scansingle.hasNext()) {\r
 581                                 seqbuffer.append(scansingle.next());\r
 582                                 coils.add(scansingle.nextFloat());\r
 583                                 rem.add(scansingle.nextFloat());\r
 584                                 hotloops.add(scansingle.nextFloat());\r
 585                         }\r
 586                         /*\r
 587                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
 588                          * seqbuffer.toString());\r
 589                          */\r
 590                         HashSet<Score> scores = new HashSet<Score>();\r
 591                         scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
 592                         scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, rem465R));\r
 593                         scores.add(new Score(DisemblResult.REM465, rem, loopsR));\r
 594                         results.put(sequenceName, scores);\r
 595 \r
 596                         scansingle.close();\r
 597                 }\r
 598                 scan.close();\r
 599                 input.close();\r
 600                 return results;\r
 601         }\r
 602 \r
 603         /**\r
 604          * Parsing:\r
 605          * \r
 606          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
 607          * 350-391, 429-485, 497-506, 539-547\r
 608          * \r
 609          * # REM465 355-368\r
 610          * \r
 611          * # HOTLOOPS 190-204\r
 612          * \r
 613          * @param lines\r
 614          * @return\r
 615          */\r
 616         private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
 617                 TreeSet<Range> ranges = new TreeSet<Range>();\r
 618 \r
 619                 Scanner scan = new Scanner(lines);\r
 620 \r
 621                 assert scan.hasNext();\r
 622                 String del = scan.next();\r
 623                 assert "#".equals(del); // pass delimiter #\r
 624                 String type = scan.next(); // pass enum name e.g. COILS\r
 625                 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
 626                                 + resultType.toString();\r
 627 \r
 628                 // beginning of the ranges\r
 629                 scan.useDelimiter(",");\r
 630                 while (scan.hasNext()) {\r
 631                         String range = scan.next();\r
 632                         if (!Util.isEmpty(range)) {\r
 633                                 ranges.add(new Range(range.split("-")));\r
 634                         }\r
 635                 }\r
 636                 return ranges;\r
 637         }\r
 638 \r
 639         /**\r
 640          * \r
 641          > Foobar_dundeefriends\r
 642          * \r
 643          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
 644          * \r
 645          * # REM465 355-368\r
 646          * \r
 647          * # HOTLOOPS 190-204\r
 648          * \r
 649          * # RESIDUE COILS REM465 HOTLOOPS\r
 650          * \r
 651          * M 0.86010 0.88512 0.37094\r
 652          * \r
 653          * T 0.79983 0.85864 0.44331\r
 654          * \r
 655          * >Next Sequence name\r
 656          * \r
 657          * \r
 658          * @param input\r
 659          * @return Map key=sequence name, value=set of score\r
 660          * @throws IOException\r
 661          * @throws UnknownFileFormatException\r
 662          */\r
 663         public static HashMap<String, Set<Score>> readGlobPlot(\r
 664                         final InputStream input) throws IOException,\r
 665                         UnknownFileFormatException {\r
 666                 Scanner scan = new Scanner(input);\r
 667                 scan.useDelimiter(">");\r
 668                 if (!scan.hasNext()) {\r
 669                         throw new UnknownFileFormatException(\r
 670                                         "In GlobPlot score format each sequence score is expected "\r
 671                                                         + "to start from the line: >Sequence name "\r
 672                                                         + " No such line was found!");\r
 673                 }\r
 674 \r
 675                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
 676                 int seqCounter = 0;\r
 677                 while (scan.hasNext()) {\r
 678                         seqCounter++;\r
 679                         String singleSeq = scan.next();\r
 680                         Scanner scansingle = new Scanner(singleSeq);\r
 681                         if (!scansingle.hasNextLine()) {\r
 682                                 throw new RuntimeException(\r
 683                                                 "The input looks like an incomplete GlobPlot file - cannot parse!");\r
 684                         }\r
 685 \r
 686                         StringBuffer seqbuffer = new StringBuffer();\r
 687                         ArrayList<Float> dydxScore = new ArrayList<Float>();\r
 688                         ArrayList<Float> rawScore = new ArrayList<Float>();\r
 689                         ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
 690 \r
 691                         String sequenceName = scansingle.nextLine().trim();\r
 692                         TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
 693                                         scansingle.nextLine());\r
 694                         TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
 695                                         scansingle.nextLine());\r
 696 \r
 697                         String title = scansingle.nextLine();\r
 698                         assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
 699 \r
 700                         while (scansingle.hasNext()) {\r
 701                                 seqbuffer.append(scansingle.next());\r
 702                                 dydxScore.add(scansingle.nextFloat());\r
 703                                 rawScore.add(scansingle.nextFloat());\r
 704                                 smoothedScore.add(scansingle.nextFloat());\r
 705                         }\r
 706                         /*\r
 707                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
 708                          * seqbuffer.toString());\r
 709                          */\r
 710                         Set<Score> scores = new TreeSet<Score>();\r
 711                         scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
 712                         scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
 713                         scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
 714                         scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
 715                         scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
 716                         results.put(sequenceName, scores);\r
 717 \r
 718                         scansingle.close();\r
 719                 }\r
 720                 scan.close();\r
 721                 input.close();\r
 722                 return results;\r
 723         }\r
 724         /**\r
 725          * Read AACon result with no alignment files. This method leaves incoming\r
 726          * InputStream open!\r
 727          * \r
 728          * @param results\r
 729          *            output file of AAConservation\r
 730          * @return Map with keys {@link ConservationMethod} -> float[]\r
 731          */\r
 732         public static HashSet<Score> readAAConResults(InputStream results) {\r
 733                 if (results == null) {\r
 734                         throw new NullPointerException(\r
 735                                         "InputStream with results must be provided");\r
 736                 }\r
 737                 HashSet<Score> annotations = new HashSet<Score>();\r
 738                 Scanner sc = new Scanner(results);\r
 739                 sc.useDelimiter("#");\r
 740                 while (sc.hasNext()) {\r
 741                         String line = sc.next();\r
 742                         int spacePos = line.indexOf(" ");\r
 743                         assert spacePos > 0 : "Space is expected as delimited between method "\r
 744                                         + "name and values!";\r
 745                         String methodLine = line.substring(0, spacePos);\r
 746                         ConservationMethod method = ConservationMethod\r
 747                                         .getMethod(methodLine);\r
 748                         assert method != null : "Method " + methodLine\r
 749                                         + " is not recognized! ";\r
 750                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
 751                         ArrayList<Float> values = new ArrayList<Float>();\r
 752                         while (valuesScanner.hasNextDouble()) {\r
 753                                 Double value = valuesScanner.nextDouble();\r
 754                                 values.add(value.floatValue());\r
 755                         }\r
 756                         annotations.add(new Score(method, values));\r
 757                 }\r
 758                 return annotations;\r
 759         }\r
 760 \r
 761         /**\r
 762          * Reads and parses Fasta or Clustal formatted file into a list of\r
 763          * FastaSequence objects\r
 764          * \r
 765          * @param inFilePath\r
 766          *            the path to the input file\r
 767          * @throws IOException\r
 768          *             if the file denoted by inFilePath cannot be read\r
 769          * @throws UnknownFileFormatException\r
 770          *             if the inFilePath points to the file which format cannot be\r
 771          *             recognised\r
 772          * @return the List of FastaSequence objects\r
 773          * \r
 774          */\r
 775         public static List<FastaSequence> openInputStream(String inFilePath)\r
 776                         throws IOException, UnknownFileFormatException {\r
 777 \r
 778                 // This stream gets closed in isValidClustalFile method\r
 779                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
 780                 // This stream is closed in the calling methods\r
 781                 InputStream inStr = new FileInputStream(inFilePath);\r
 782                 List<FastaSequence> fastaSeqs = null;\r
 783                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
 784                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
 785                         // alignment cannot be null see\r
 786                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
 787                         fastaSeqs = al.getSequences();\r
 788                 } else {\r
 789                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
 790                 }\r
 791                 return fastaSeqs;\r
 792         }\r
 793 \r
 794 }\r
 795 \r
 796 enum DisemblResult {\r
 797         /** These contains ranges and scores */\r
 798         COILS, REM465, HOTLOOPS\r
 799 }\r
 800 enum GlobProtResult {\r
 801         /** This a range with no scores */\r
 802         GlobDoms,\r
 803         /** This a range with no scores */\r
 804         Disorder,\r
 805         /** This a score with no range */\r
 806         Dydx,\r
 807         /** This a score with no range */\r
 808         SmoothedScore,\r
 809         /** This a score with no range */\r
 810         RawScore\r
 811 }\r
 812 \r
 813 enum IUPredResult {\r
 814         /**\r
 815          * Short disorder\r
 816          */\r
 817         Short,\r
 818         /**\r
 819          * Long disorder\r
 820          */\r
 821         Long,\r
 822         /**\r
 823          * Globular domains\r
 824          */\r
 825         Glob;\r
 826 \r
 827         static IUPredResult getType(File file) {\r
 828                 assert file != null;\r
 829                 String name = file.getName();\r
 830                 if (name.endsWith(Long.toString().toLowerCase())) {\r
 831                         return Long;\r
 832                 }\r
 833                 if (name.endsWith(Short.toString().toLowerCase())) {\r
 834                         return Short;\r
 835                 }\r
 836                 if (name.endsWith(Glob.toString().toLowerCase())) {\r
 837                         return Glob;\r
 838                 }\r
 839                 throw new AssertionError(\r
 840                                 "IUPred result file type cannot be recognised! "\r
 841                                                 + "\nFile must ends with one of [glob, long or short]"\r
 842                                                 + "\n but given file name was: " + file.getName());\r
 843         }\r
 844 }