datamodel/compbio/data/sequence/SequenceUtil.java

   1 /*\r
   2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
   3  * Jalview Web Services version: 2.0 This library is free software; you can\r
   4  * redistribute it and/or modify it under the terms of the Apache License\r
   5  * version 2 as published by the Apache Software Foundation This library is\r
   6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
   7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
   8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
   9  * license is in apache_license.txt. It is also available here: see:\r
  10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
  11  * work distributed in source code form must include this copyright and license\r
  12  * notice.\r
  13  */\r
  14 \r
  15 package compbio.data.sequence;\r
  16 \r
  17 import java.io.BufferedReader;\r
  18 import java.io.BufferedWriter;\r
  19 import java.io.Closeable;\r
  20 import java.io.File;\r
  21 import java.io.FileInputStream;\r
  22 import java.io.IOException;\r
  23 import java.io.InputStream;\r
  24 import java.io.InputStreamReader;\r
  25 import java.io.OutputStream;\r
  26 import java.io.OutputStreamWriter;\r
  27 import java.util.ArrayList;\r
  28 import java.util.HashMap;\r
  29 import java.util.HashSet;\r
  30 import java.util.List;\r
  31 import java.util.Map;\r
  32 import java.util.Scanner;\r
  33 import java.util.Set;\r
  34 import java.util.TreeSet;\r
  35 import java.util.logging.Level;\r
  36 import java.util.regex.Matcher;\r
  37 import java.util.regex.Pattern;\r
  38 \r
  39 import compbio.util.Util;\r
  40 \r
  41 /**\r
  42  * Utility class for operations on sequences\r
  43  * \r
  44  * @author Peter Troshin\r
  45  * @since 1.0\r
  46  * @version 2.0 June 2011\r
  47  */\r
  48 public final class SequenceUtil {\r
  49 \r
        /**
         * A single whitespace character, i.e. the regex class {@code \s}:
         * [ \t\n\x0B\f\r]
         */
        public static final Pattern WHITE_SPACE = Pattern.compile("\\s");

        /**
         * A single decimal digit, i.e. the regex class {@code \d}: [0-9]
         */
        public static final Pattern DIGIT = Pattern.compile("\\d");

        /**
         * A single non-word character, i.e. the regex class {@code \W}:
         * [^a-zA-Z_0-9]. Note that the underscore counts as a word character and
         * is therefore not matched.
         */
        public static final Pattern NONWORD = Pattern.compile("\\W");

        /**
         * A run of one or more of the 20 valid amino acid letters, matched case
         * insensitively
         */
        public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",
                        Pattern.CASE_INSENSITIVE);

        /**
         * Inversion of the AA pattern: a run of one or more characters none of
         * which is a valid amino acid letter
         */
        public static final Pattern NON_AA = Pattern.compile(
                        "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);

        /**
         * Same as the AA pattern but with two additional letters - X and U
         */
        public static final Pattern AMBIGUOUS_AA = Pattern.compile(
                        "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);

        /**
         * A run of one or more nucleotide letters a, t, g, c, u, matched case
         * insensitively
         */
        public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",
                        Pattern.CASE_INSENSITIVE);

        /**
         * A run of one or more nucleotide letters including the ambiguity codes
         */
        public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(
                        "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC
        /**
         * Inversion of the NUCLEOTIDE pattern: a run of one or more characters
         * none of which is one of a, t, g, c, u
         */
        public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",
                        Pattern.CASE_INSENSITIVE);
  99 \r
        /**
         * Private constructor which prevents the instantiation of this utility
         * class.
         */
        private SequenceUtil() {
        } // utility class, no instantiation
 102 \r
 103         /**\r
 104          * @return true is the sequence contains only letters a,c, t, g, u\r
 105          */\r
 106         public static boolean isNucleotideSequence(final FastaSequence s) {\r
 107                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
 108         }\r
 109 \r
 110         /**\r
 111          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
 112          * (!) - B char\r
 113          */\r
 114         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
 115                 sequence = SequenceUtil.cleanSequence(sequence);\r
 116                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 117                         return false;\r
 118                 }\r
 119                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
 120                         return false;\r
 121                         /*\r
 122                          * System.out.format("I found the text starting at " +\r
 123                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
 124                          * nonDNAmatcher.end());\r
 125                          */\r
 126                 }\r
 127                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
 128                 return DNAmatcher.find();\r
 129         }\r
 130 \r
 131         /**\r
 132          * Removes all whitespace chars in the sequence string\r
 133          * \r
 134          * @param sequence\r
 135          * @return cleaned up sequence\r
 136          */\r
 137         public static String cleanSequence(String sequence) {\r
 138                 assert sequence != null;\r
 139                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
 140                 sequence = m.replaceAll("").toUpperCase();\r
 141                 return sequence;\r
 142         }\r
 143 \r
 144         /**\r
 145          * Removes all special characters and digits as well as whitespace chars\r
 146          * from the sequence\r
 147          * \r
 148          * @param sequence\r
 149          * @return cleaned up sequence\r
 150          */\r
 151         public static String deepCleanSequence(String sequence) {\r
 152                 sequence = SequenceUtil.cleanSequence(sequence);\r
 153                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
 154                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
 155                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
 156                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
 157                 return sequence;\r
 158         }\r
 159 \r
 160         /**\r
 161          * Remove all non AA chars from the sequence\r
 162          * \r
 163          * @param sequence\r
 164          *            the sequence to clean\r
 165          * @return cleaned sequence\r
 166          */\r
 167         public static String cleanProteinSequence(String sequence) {\r
 168                 return SequenceUtil.NON_AA.matcher(sequence).replaceAll("");\r
 169         }\r
 170 \r
 171         /**\r
 172          * @param sequence\r
 173          * @return true is the sequence is a protein sequence, false overwise\r
 174          */\r
 175         public static boolean isProteinSequence(String sequence) {\r
 176                 sequence = SequenceUtil.cleanSequence(sequence);\r
 177                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 178                         return false;\r
 179                 }\r
 180                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 181                         return false;\r
 182                 }\r
 183                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 184                         return false;\r
 185                 }\r
 186                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
 187                 return protmatcher.find();\r
 188         }\r
 189 \r
 190         /**\r
 191          * Check whether the sequence confirms to amboguous protein sequence\r
 192          * \r
 193          * @param sequence\r
 194          * @return return true only if the sequence if ambiguous protein sequence\r
 195          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
 196          *         protein or DNA\r
 197          */\r
 198         public static boolean isAmbiguosProtein(String sequence) {\r
 199                 sequence = SequenceUtil.cleanSequence(sequence);\r
 200                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 201                         return false;\r
 202                 }\r
 203                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 204                         return false;\r
 205                 }\r
 206                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 207                         return false;\r
 208                 }\r
 209                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
 210                         return false;\r
 211                 }\r
 212                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
 213                 return amb_prot.find();\r
 214         }\r
 215 \r
 216         /**\r
 217          * Writes list of FastaSequeces into the outstream formatting the sequence\r
 218          * so that it contains width chars on each line\r
 219          * \r
 220          * @param outstream\r
 221          * @param sequences\r
 222          * @param width\r
 223          *            - the maximum number of characters to write in one line\r
 224          * @throws IOException\r
 225          */\r
 226         public static void writeFasta(final OutputStream outstream,\r
 227                         final List<FastaSequence> sequences, final int width)\r
 228                         throws IOException {\r
 229                 writeFastaKeepTheStream(outstream, sequences, width);\r
 230                 outstream.close();\r
 231         }\r
 232 \r
 233         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
 234                         final List<FastaSequence> sequences, final int width)\r
 235                         throws IOException {\r
 236                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
 237                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
 238                 for (final FastaSequence fs : sequences) {\r
 239                         fastawriter.write(">" + fs.getId() + "\n");\r
 240                         fastawriter.write(fs.getFormatedSequence(width));\r
 241                         fastawriter.write("\n");\r
 242                 }\r
 243                 fastawriter.flush();\r
 244                 writer.flush();\r
 245         }\r
 246 \r
 247         /**\r
 248          * Reads fasta sequences from inStream into the list of FastaSequence\r
 249          * objects\r
 250          * \r
 251          * @param inStream\r
 252          *            from\r
 253          * @return list of FastaSequence objects\r
 254          * @throws IOException\r
 255          */\r
 256         public static List<FastaSequence> readFasta(final InputStream inStream)\r
 257                         throws IOException {\r
 258                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
 259                 FastaReader reader = new FastaReader(inStream);\r
 260                 while (reader.hasNext()) {\r
 261                         seqs.add(reader.next());\r
 262                 }\r
 263                 inStream.close();\r
 264                 return seqs;\r
 265         }\r
 266 \r
 267         /**\r
 268          * Writes FastaSequence in the file, each sequence will take one line only\r
 269          * \r
 270          * @param os\r
 271          * @param sequences\r
 272          * @throws IOException\r
 273          */\r
 274         public static void writeFasta(final OutputStream os,\r
 275                         final List<FastaSequence> sequences) throws IOException {\r
 276                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
 277                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
 278                 for (final FastaSequence fs : sequences) {\r
 279                         fasta_out.write(fs.getOnelineFasta());\r
 280                 }\r
 281                 fasta_out.close();\r
 282                 outWriter.close();\r
 283         }\r
 284 \r
 285         /**\r
 286          * Read IUPred output\r
 287          * \r
 288          * @param result\r
 289          * @return\r
 290          * @throws IOException\r
 291          * @throws UnknownFileFormatException\r
 292          */\r
 293         public static Map<String, Score> readIUPred(final File result)\r
 294                         throws IOException, UnknownFileFormatException {\r
 295                 InputStream input = new FileInputStream(result);\r
 296                 Map<String, Score> sequences = readIUPred(input,\r
 297                                 IUPredResult.getType(result));\r
 298                 input.close();\r
 299                 return sequences;\r
 300         }\r
 301 \r
 302         // Check the type of the file e.g. long| short or domain\r
 303         // and read\r
 304         /**\r
 305          * ## Long Disorder\r
 306          * \r
 307          * # P53_HUMAN\r
 308          * \r
 309          * 1 M 0.9943\r
 310          * \r
 311          * 2 E 0.9917\r
 312          * \r
 313          * 3 E 0.9879\r
 314          * \r
 315          * (every line)\r
 316          * \r
 317          * @throws IOException\r
 318          * @throws UnknownFileFormatException\r
 319          * \r
 320          * \r
 321          */\r
 322         private static Map<String, Score> readIUPred(InputStream input,\r
 323                         IUPredResult type) throws IOException, UnknownFileFormatException {\r
 324 \r
 325                 Score score = null;\r
 326                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
 327                 Scanner scan = new Scanner(input);\r
 328                 scan.useDelimiter("#");\r
 329                 while (scan.hasNext()) {\r
 330                         String nextEntry = scan.next();\r
 331                         Scanner entry = new Scanner(nextEntry);\r
 332                         String name = entry.nextLine().trim();\r
 333                         // inside entry:\r
 334                         if (IUPredResult.Glob == type) {\r
 335                                 // parse domains\r
 336                                 TreeSet<Range> ranges = parseIUPredDomains(entry);\r
 337                                 score = new Score(type, ranges);\r
 338                         } else {\r
 339                                 // parse short | long\r
 340                                 float[] scores = parseIUPredScores(entry);\r
 341                                 score = new Score(type, scores);\r
 342                         }\r
 343                         entry.close();\r
 344                         seqs.put(name, score);\r
 345                 }\r
 346 \r
 347                 scan.close();\r
 348                 return seqs;\r
 349         }\r
 350 \r
 351         /**\r
 352          * # P53_HUMA\r
 353          * \r
 354          * Number of globular domains: 2\r
 355          * \r
 356          * globular domain 1. 98 - 269\r
 357          * \r
 358          * globular domain 2. 431 - 482\r
 359          * \r
 360          * >P53_HUMA\r
 361          * \r
 362          * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp\r
 363          * \r
 364          * @param scan\r
 365          */\r
 366         private static TreeSet<Range> parseIUPredDomains(Scanner scan) {\r
 367                 String header = "Number of globular domains:";\r
 368                 String domainPref = "globular domain";\r
 369                 TreeSet<Range> ranges = new TreeSet<Range>();\r
 370                 String line = scan.nextLine().trim();\r
 371                 assert line.startsWith(header);\r
 372                 line = line.substring(header.length()).trim();\r
 373                 int domainNum = Integer.parseInt(line);\r
 374                 if (domainNum == 0) {\r
 375                         return ranges;\r
 376                 }\r
 377 \r
 378                 for (int i = 0; i < domainNum; i++) {\r
 379                         assert scan.hasNextLine();\r
 380                         line = scan.nextLine();\r
 381                         assert line.trim().startsWith(domainPref);\r
 382                         line = line.substring(line.indexOf(".") + 1).trim();\r
 383                         Range r = new Range(line.split("-"));\r
 384                         ranges.add(r);\r
 385                 }\r
 386 \r
 387                 return ranges;\r
 388         }\r
 389         /*\r
 390          * 1 M 0.9943\r
 391          * \r
 392          * 2 E 0.9917\r
 393          */\r
 394         private static float[] parseIUPredScores(Scanner scan)\r
 395                         throws UnknownFileFormatException {\r
 396                 List<String> annotation = new ArrayList<String>();\r
 397                 while (scan.hasNextLine()) {\r
 398                         String line = scan.nextLine().trim();\r
 399                         String[] val = line.split("\\s+");\r
 400                         annotation.add(val[2]);\r
 401                 }\r
 402                 return convertToNumber(annotation\r
 403                                 .toArray(new String[annotation.size()]));\r
 404         }\r
 405 \r
 406         public static Map<String, Score> readJRonn(final File result)\r
 407                         throws IOException, UnknownFileFormatException {\r
 408                 InputStream input = new FileInputStream(result);\r
 409                 Map<String, Score> sequences = readJRonn(input);\r
 410                 input.close();\r
 411                 return sequences;\r
 412         }\r
 413 \r
 414         /**\r
 415          * Reader for JRonn horizontal file format\r
 416          * \r
 417          * <pre>\r
 418          * &gtFoobar M G D T T A G 0.48 0.42\r
 419          * 0.42 0.48 0.52 0.53 0.54\r
 420          * \r
 421          * <pre>\r
 422          * Where all values are tab delimited\r
 423          * \r
 424          * @param inStream\r
 425          *            the InputStream connected to the JRonn output file\r
 426          * @return List of {@link AnnotatedSequence} objects\r
 427          * @throws IOException\r
 428          *             is thrown if the inStream has problems accessing the data\r
 429          * @throws UnknownFileFormatException\r
 430          *             is thrown if the inStream represents an unknown source of\r
 431          * data, i.e. not a JRonn output\r
 432          */\r
 433         public static Map<String, Score> readJRonn(final InputStream inStream)\r
 434                         throws IOException, UnknownFileFormatException {\r
 435                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
 436 \r
 437                 final BufferedReader infasta = new BufferedReader(\r
 438                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
 439 \r
 440                 String line;\r
 441                 String sname = "";\r
 442                 do {\r
 443                         line = infasta.readLine();\r
 444                         if (line == null || line.isEmpty()) {\r
 445                                 // skip empty lines\r
 446                                 continue;\r
 447                         }\r
 448                         if (line.startsWith(">")) {\r
 449                                 // read name\r
 450                                 sname = line.trim().substring(1);\r
 451                                 // read sequence line\r
 452                                 line = infasta.readLine();\r
 453                                 final String sequence = line.replace("\t", "");\r
 454                                 // read annotation line\r
 455                                 line = infasta.readLine();\r
 456                                 String[] annotValues = line.split("\t");\r
 457                                 float[] annotation = convertToNumber(annotValues);\r
 458                                 if (annotation.length != sequence.length()) {\r
 459                                         throw new UnknownFileFormatException(\r
 460                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
 461                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
 462                                 }\r
 463                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
 464                         }\r
 465                 } while (line != null);\r
 466 \r
 467                 infasta.close();\r
 468                 return seqs;\r
 469         }\r
 470 \r
 471         private static float[] convertToNumber(String[] annotValues)\r
 472                         throws UnknownFileFormatException {\r
 473                 float[] annotation = new float[annotValues.length];\r
 474                 try {\r
 475                         for (int i = 0; i < annotation.length; i++) {\r
 476                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
 477                         }\r
 478                 } catch (NumberFormatException e) {\r
 479                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
 480                                         e.getCause());\r
 481                 }\r
 482                 return annotation;\r
 483         }\r
 484 \r
        /**
         * Description of the expected JRonn file layout. Used as the message, or
         * as a part of the message, of the UnknownFileFormatException thrown when
         * the input cannot be parsed.
         */
        private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"
                        + ">sequence_name\n "
                        + "M    V       S\n"
                        + "0.43 0.22    0.65\n"
                        + "Where first line is the sequence name,\n"
                        + "second line is the tab delimited sequence,\n"
                        + "third line contains tab delimited disorder prediction values.\n"
                        + "No lines are allowed between these three. Additionally, the number of  "
                        + "sequence residues must be equal to the number of the disorder values.";
 494 \r
 495         /**\r
 496          * Closes the Closable and logs the exception if any\r
 497          * \r
 498          * @param log\r
 499          * @param stream\r
 500          */\r
 501         public final static void closeSilently(java.util.logging.Logger log,\r
 502                         Closeable stream) {\r
 503                 if (stream != null) {\r
 504                         try {\r
 505                                 stream.close();\r
 506                         } catch (IOException e) {\r
 507                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
 508                         }\r
 509                 }\r
 510         }\r
 511 \r
 512         /**\r
 513          * \r
 514          > Foobar_dundeefriends\r
 515          * \r
 516          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
 517          * \r
 518          * # REM465 355-368\r
 519          * \r
 520          * # HOTLOOPS 190-204\r
 521          * \r
 522          * # RESIDUE COILS REM465 HOTLOOPS\r
 523          * \r
 524          * M 0.86010 0.88512 0.37094\r
 525          * \r
 526          * T 0.79983 0.85864 0.44331\r
 527          * \r
 528          * >Next Sequence name\r
 529          * \r
 530          * \r
 531          * @param input\r
 532          * @return\r
 533          * @throws IOException\r
 534          * @throws UnknownFileFormatException\r
 535          */\r
 536         public static HashMap<String, Set<Score>> readDisembl(\r
 537                         final InputStream input) throws IOException,\r
 538                         UnknownFileFormatException {\r
 539                 Scanner scan = new Scanner(input);\r
 540                 scan.useDelimiter(">");\r
 541                 if (!scan.hasNext()) {\r
 542                         throw new UnknownFileFormatException(\r
 543                                         "In Disembl score format each sequence score is expected "\r
 544                                                         + "to start from the line: >Sequence name "\r
 545                                                         + " No such line was found!");\r
 546                 }\r
 547 \r
 548                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
 549                 int seqCounter = 0;\r
 550                 while (scan.hasNext()) {\r
 551                         seqCounter++;\r
 552                         String singleSeq = scan.next();\r
 553                         Scanner scansingle = new Scanner(singleSeq);\r
 554                         if (!scansingle.hasNextLine()) {\r
 555                                 throw new RuntimeException(\r
 556                                                 "The input looks like an incomplete disembl file - cannot parse!");\r
 557                         }\r
 558 \r
 559                         StringBuffer seqbuffer = new StringBuffer();\r
 560                         ArrayList<Float> coils = new ArrayList<Float>();\r
 561                         ArrayList<Float> rem = new ArrayList<Float>();\r
 562                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
 563 \r
 564                         String sequenceName = scansingle.nextLine().trim();\r
 565                         TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
 566                                         scansingle.nextLine());\r
 567                         TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
 568                                         scansingle.nextLine());\r
 569                         TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
 570                                         scansingle.nextLine());\r
 571 \r
 572                         String title = scansingle.nextLine();\r
 573                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
 574 \r
 575                         while (scansingle.hasNext()) {\r
 576                                 seqbuffer.append(scansingle.next());\r
 577                                 coils.add(scansingle.nextFloat());\r
 578                                 rem.add(scansingle.nextFloat());\r
 579                                 hotloops.add(scansingle.nextFloat());\r
 580                         }\r
 581                         /*\r
 582                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
 583                          * seqbuffer.toString());\r
 584                          */\r
 585                         HashSet<Score> scores = new HashSet<Score>();\r
 586                         scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
 587                         scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, rem465R));\r
 588                         scores.add(new Score(DisemblResult.REM465, rem, loopsR));\r
 589                         results.put(sequenceName, scores);\r
 590 \r
 591                         scansingle.close();\r
 592                 }\r
 593                 scan.close();\r
 594                 input.close();\r
 595                 return results;\r
 596         }\r
 597 \r
 598         /**\r
 599          * Parsing:\r
 600          * \r
 601          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
 602          * 350-391, 429-485, 497-506, 539-547\r
 603          * \r
 604          * # REM465 355-368\r
 605          * \r
 606          * # HOTLOOPS 190-204\r
 607          * \r
 608          * @param lines\r
 609          * @return\r
 610          */\r
 611         private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
 612                 TreeSet<Range> ranges = new TreeSet<Range>();\r
 613 \r
 614                 Scanner scan = new Scanner(lines);\r
 615 \r
 616                 assert scan.hasNext();\r
 617                 String del = scan.next();\r
 618                 assert "#".equals(del); // pass delimiter #\r
 619                 String type = scan.next(); // pass enum name e.g. COILS\r
 620                 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
 621                                 + resultType.toString();\r
 622 \r
 623                 // beginning of the ranges\r
 624                 scan.useDelimiter(",");\r
 625                 while (scan.hasNext()) {\r
 626                         String range = scan.next();\r
 627                         if (!Util.isEmpty(range)) {\r
 628                                 ranges.add(new Range(range.split("-")));\r
 629                         }\r
 630                 }\r
 631                 return ranges;\r
 632         }\r
 633 \r
 634         /**\r
 635          * \r
 636          > Foobar_dundeefriends\r
 637          * \r
 638          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
 639          * \r
 640          * # REM465 355-368\r
 641          * \r
 642          * # HOTLOOPS 190-204\r
 643          * \r
 644          * # RESIDUE COILS REM465 HOTLOOPS\r
 645          * \r
 646          * M 0.86010 0.88512 0.37094\r
 647          * \r
 648          * T 0.79983 0.85864 0.44331\r
 649          * \r
 650          * >Next Sequence name\r
 651          * \r
 652          * \r
 653          * @param input\r
 654          * @return\r
 655          * @throws IOException\r
 656          * @throws UnknownFileFormatException\r
 657          */\r
 658         public static HashMap<String, Set<Score>> readGlobPlot(\r
 659                         final InputStream input) throws IOException,\r
 660                         UnknownFileFormatException {\r
 661                 Scanner scan = new Scanner(input);\r
 662                 scan.useDelimiter(">");\r
 663                 if (!scan.hasNext()) {\r
 664                         throw new UnknownFileFormatException(\r
 665                                         "In GlobPlot score format each sequence score is expected "\r
 666                                                         + "to start from the line: >Sequence name "\r
 667                                                         + " No such line was found!");\r
 668                 }\r
 669 \r
 670                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
 671                 int seqCounter = 0;\r
 672                 while (scan.hasNext()) {\r
 673                         seqCounter++;\r
 674                         String singleSeq = scan.next();\r
 675                         Scanner scansingle = new Scanner(singleSeq);\r
 676                         if (!scansingle.hasNextLine()) {\r
 677                                 throw new RuntimeException(\r
 678                                                 "The input looks like an incomplete GlobPlot file - cannot parse!");\r
 679                         }\r
 680 \r
 681                         StringBuffer seqbuffer = new StringBuffer();\r
 682                         ArrayList<Float> dydxScore = new ArrayList<Float>();\r
 683                         ArrayList<Float> rawScore = new ArrayList<Float>();\r
 684                         ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
 685 \r
 686                         String sequenceName = scansingle.nextLine().trim();\r
 687                         TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
 688                                         scansingle.nextLine());\r
 689                         TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
 690                                         scansingle.nextLine());\r
 691 \r
 692                         String title = scansingle.nextLine();\r
 693                         assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
 694 \r
 695                         while (scansingle.hasNext()) {\r
 696                                 seqbuffer.append(scansingle.next());\r
 697                                 dydxScore.add(scansingle.nextFloat());\r
 698                                 rawScore.add(scansingle.nextFloat());\r
 699                                 smoothedScore.add(scansingle.nextFloat());\r
 700                         }\r
 701                         /*\r
 702                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
 703                          * seqbuffer.toString());\r
 704                          */\r
 705                         Set<Score> scores = new TreeSet<Score>();\r
 706                         scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
 707                         scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
 708                         scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
 709                         scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
 710                         scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
 711                         results.put(sequenceName, scores);\r
 712 \r
 713                         scansingle.close();\r
 714                 }\r
 715                 scan.close();\r
 716                 input.close();\r
 717                 return results;\r
 718         }\r
 719         /**\r
 720          * Read AACon result with no alignment files. This method leaves incoming\r
 721          * InputStream open!\r
 722          * \r
 723          * @param results\r
 724          *            output file of AAConservation\r
 725          * @return Map with keys {@link ConservationMethod} -> float[]\r
 726          */\r
 727         public static HashSet<Score> readAAConResults(InputStream results) {\r
 728                 if (results == null) {\r
 729                         throw new NullPointerException(\r
 730                                         "InputStream with results must be provided");\r
 731                 }\r
 732                 HashSet<Score> annotations = new HashSet<Score>();\r
 733                 Scanner sc = new Scanner(results);\r
 734                 sc.useDelimiter("#");\r
 735                 while (sc.hasNext()) {\r
 736                         String line = sc.next();\r
 737                         int spacePos = line.indexOf(" ");\r
 738                         assert spacePos > 0 : "Space is expected as delimited between method "\r
 739                                         + "name and values!";\r
 740                         String methodLine = line.substring(0, spacePos);\r
 741                         ConservationMethod method = ConservationMethod\r
 742                                         .getMethod(methodLine);\r
 743                         assert method != null : "Method " + methodLine\r
 744                                         + " is not recognized! ";\r
 745                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
 746                         ArrayList<Float> values = new ArrayList<Float>();\r
 747                         while (valuesScanner.hasNextDouble()) {\r
 748                                 Double value = valuesScanner.nextDouble();\r
 749                                 values.add(value.floatValue());\r
 750                         }\r
 751                         annotations.add(new Score(method, values));\r
 752                 }\r
 753                 return annotations;\r
 754         }\r
 755 \r
 756         /**\r
 757          * Reads and parses Fasta or Clustal formatted file into a list of\r
 758          * FastaSequence objects\r
 759          * \r
 760          * @param inFilePath\r
 761          *            the path to the input file\r
 762          * @throws IOException\r
 763          *             if the file denoted by inFilePath cannot be read\r
 764          * @throws UnknownFileFormatException\r
 765          *             if the inFilePath points to the file which format cannot be\r
 766          *             recognised\r
 767          * @return the List of FastaSequence objects\r
 768          * \r
 769          */\r
 770         public static List<FastaSequence> openInputStream(String inFilePath)\r
 771                         throws IOException, UnknownFileFormatException {\r
 772 \r
 773                 // This stream gets closed in isValidClustalFile method\r
 774                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
 775                 // This stream is closed in the calling methods\r
 776                 InputStream inStr = new FileInputStream(inFilePath);\r
 777                 List<FastaSequence> fastaSeqs = null;\r
 778                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
 779                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
 780                         // alignment cannot be null see\r
 781                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
 782                         fastaSeqs = al.getSequences();\r
 783                 } else {\r
 784                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
 785                 }\r
 786                 return fastaSeqs;\r
 787         }\r
 788 \r
 789 }\r
 790 \r
 791 enum DisemblResult {\r
 792         /** These contains ranges and scores */\r
 793         COILS, REM465, HOTLOOPS\r
 794 }\r
 795 enum GlobProtResult {\r
 796         /** This a range with no scores */\r
 797         GlobDoms,\r
 798         /** This a range with no scores */\r
 799         Disorder,\r
 800         /** This a score with no range */\r
 801         Dydx,\r
 802         /** This a score with no range */\r
 803         SmoothedScore,\r
 804         /** This a score with no range */\r
 805         RawScore\r
 806 }\r
 807 \r
 808 enum IUPredResult {\r
 809         /**\r
 810          * Short disorder\r
 811          */\r
 812         Short,\r
 813         /**\r
 814          * Long disorder\r
 815          */\r
 816         Long,\r
 817         /**\r
 818          * Globular domains\r
 819          */\r
 820         Glob;\r
 821 \r
 822         static IUPredResult getType(File file) {\r
 823                 assert file != null;\r
 824                 String name = file.getName();\r
 825                 if (name.endsWith(Long.toString().toLowerCase())) {\r
 826                         return Long;\r
 827                 }\r
 828                 if (name.endsWith(Short.toString().toLowerCase())) {\r
 829                         return Short;\r
 830                 }\r
 831                 if (name.endsWith(Glob.toString().toLowerCase())) {\r
 832                         return Glob;\r
 833                 }\r
 834                 throw new AssertionError(\r
 835                                 "IUPred result file type cannot be recognised! "\r
 836                                                 + "\nFile must ends with one of [glob, long or short]"\r
 837                                                 + "\n but given file name was: " + file.getName());\r
 838         }\r
 839 }