datamodel/compbio/data/sequence/SequenceUtil.java

   1 /*\r
   2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
   3  * Jalview Web Services version: 2.0 This library is free software; you can\r
   4  * redistribute it and/or modify it under the terms of the Apache License\r
   5  * version 2 as published by the Apache Software Foundation This library is\r
   6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
   7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
   8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
   9  * license is in apache_license.txt. It is also available here: see:\r
  10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
  11  * work distributed in source code form must include this copyright and license\r
  12  * notice.\r
  13  */\r
  14 \r
  15 package compbio.data.sequence;\r
  16 \r
  17 import java.io.BufferedReader;\r
  18 import java.io.BufferedWriter;\r
  19 import java.io.Closeable;\r
  20 import java.io.File;\r
  21 import java.io.FileInputStream;\r
  22 import java.io.IOException;\r
  23 import java.io.InputStream;\r
  24 import java.io.InputStreamReader;\r
  25 import java.io.OutputStream;\r
  26 import java.io.OutputStreamWriter;\r
  27 import java.util.ArrayList;\r
  28 import java.util.HashMap;\r
  29 import java.util.HashSet;\r
  30 import java.util.List;\r
  31 import java.util.Map;\r
  32 import java.util.Scanner;\r
  33 import java.util.Set;\r
  34 import java.util.TreeSet;\r
  35 import java.util.logging.Level;\r
  36 import java.util.regex.Matcher;\r
  37 import java.util.regex.Pattern;\r
  38 \r
  39 import compbio.util.Util;\r
  40 \r
  41 /**\r
  42  * Utility class for operations on sequences\r
  43  * \r
  44  * @author Petr Troshin\r
  45  * @version 1.0\r
  46  */\r
  47 public final class SequenceUtil {\r
  48 \r
        /**
         * A single whitespace character, i.e. the predefined regex class
         * {@code \s}: [ \t\n\x0B\f\r]
         */
        public static final Pattern WHITE_SPACE = Pattern.compile("\\s");

        /**
         * A single decimal digit, i.e. the predefined regex class {@code \d}
         */
        public static final Pattern DIGIT = Pattern.compile("\\d");

        /**
         * A single non-word character, i.e. the predefined regex class
         * {@code \W}: anything but an ASCII letter, a digit or an underscore
         */
        public static final Pattern NONWORD = Pattern.compile("\\W");

        /**
         * One or more valid amino acid letters (the 20 standard one-letter
         * codes), compiled case-insensitively
         */
        public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",
                        Pattern.CASE_INSENSITIVE);

        /**
         * Inversion of the AA pattern: one or more characters that are not among
         * the letters accepted by {@link #AA}
         */
        public static final Pattern NON_AA = Pattern.compile(
                        "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);

        /**
         * Same as the AA pattern but with two additional letters - X and U
         */
        public static final Pattern AMBIGUOUS_AA = Pattern.compile(
                        "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);

        /**
         * One or more nucleotide letters: a, t, g, c, u
         */
        public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",
                        Pattern.CASE_INSENSITIVE);

        /**
         * One or more ambiguous nucleotide letters: the letters accepted by
         * {@link #NUCLEOTIDE} plus the ambiguity codes R, Y, M, K, S, W, H, B, V,
         * D and N
         */
        public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(
                        "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC
        /**
         * Inversion of the NUCLEOTIDE pattern: one or more characters that are
         * not among the letters accepted by {@link #NUCLEOTIDE}
         */
        public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",
                        Pattern.CASE_INSENSITIVE);

        /**
         * Private constructor: the class only offers static members and is not
         * meant to be instantiated.
         */
        private SequenceUtil() {
        } // utility class, no instantiation
 101 \r
 102         /*\r
 103          * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
 104          * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
 105          * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
 106          * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
 107          * SysPrefs.newlinechar); pir_out.close(); } public static void\r
 108          * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
 109          * BufferedWriter fasta_out = new BufferedWriter( new\r
 110          * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
 111          * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
 112          * SysPrefs.newlinechar); fasta_out.close(); }\r
 113          */\r
 114 \r
 115         /**\r
 116          * @return true is the sequence contains only letters a,c, t, g, u\r
 117          */\r
 118         public static boolean isNucleotideSequence(final FastaSequence s) {\r
 119                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
 120         }\r
 121 \r
 122         /**\r
 123          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
 124          * (!) - B char\r
 125          */\r
 126         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
 127                 sequence = SequenceUtil.cleanSequence(sequence);\r
 128                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 129                         return false;\r
 130                 }\r
 131                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
 132                         return false;\r
 133                         /*\r
 134                          * System.out.format("I found the text starting at " +\r
 135                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
 136                          * nonDNAmatcher.end());\r
 137                          */\r
 138                 }\r
 139                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
 140                 return DNAmatcher.find();\r
 141         }\r
 142 \r
 143         /**\r
 144          * Removes all whitespace chars in the sequence string\r
 145          * \r
 146          * @param sequence\r
 147          * @return cleaned up sequence\r
 148          */\r
 149         public static String cleanSequence(String sequence) {\r
 150                 assert sequence != null;\r
 151                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
 152                 sequence = m.replaceAll("").toUpperCase();\r
 153                 return sequence;\r
 154         }\r
 155 \r
 156         /**\r
 157          * Removes all special characters and digits as well as whitespace chars\r
 158          * from the sequence\r
 159          * \r
 160          * @param sequence\r
 161          * @return cleaned up sequence\r
 162          */\r
 163         public static String deepCleanSequence(String sequence) {\r
 164                 sequence = SequenceUtil.cleanSequence(sequence);\r
 165                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
 166                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
 167                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
 168                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
 169                 return sequence;\r
 170         }\r
 171 \r
 172         /**\r
 173          * Remove all non AA chars from the sequence\r
 174          * \r
 175          * @param sequence\r
 176          *            the sequence to clean\r
 177          * @return cleaned sequence\r
 178          */\r
 179         public static String cleanProteinSequence(String sequence) {\r
 180                 return SequenceUtil.NON_AA.matcher(sequence).replaceAll("");\r
 181         }\r
 182 \r
 183         /**\r
 184          * @param sequence\r
 185          * @return true is the sequence is a protein sequence, false overwise\r
 186          */\r
 187         public static boolean isProteinSequence(String sequence) {\r
 188                 sequence = SequenceUtil.cleanSequence(sequence);\r
 189                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 190                         return false;\r
 191                 }\r
 192                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 193                         return false;\r
 194                 }\r
 195                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 196                         return false;\r
 197                 }\r
 198                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
 199                 return protmatcher.find();\r
 200         }\r
 201 \r
 202         /**\r
 203          * Check whether the sequence confirms to amboguous protein sequence\r
 204          * \r
 205          * @param sequence\r
 206          * @return return true only if the sequence if ambiguous protein sequence\r
 207          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
 208          *         protein or DNA\r
 209          */\r
 210         public static boolean isAmbiguosProtein(String sequence) {\r
 211                 sequence = SequenceUtil.cleanSequence(sequence);\r
 212                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 213                         return false;\r
 214                 }\r
 215                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 216                         return false;\r
 217                 }\r
 218                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 219                         return false;\r
 220                 }\r
 221                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
 222                         return false;\r
 223                 }\r
 224                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
 225                 return amb_prot.find();\r
 226         }\r
 227 \r
 228         /**\r
 229          * Writes list of FastaSequeces into the outstream formatting the sequence\r
 230          * so that it contains width chars on each line\r
 231          * \r
 232          * @param outstream\r
 233          * @param sequences\r
 234          * @param width\r
 235          *            - the maximum number of characters to write in one line\r
 236          * @throws IOException\r
 237          */\r
 238         public static void writeFasta(final OutputStream outstream,\r
 239                         final List<FastaSequence> sequences, final int width)\r
 240                         throws IOException {\r
 241                 writeFastaKeepTheStream(outstream, sequences, width);\r
 242                 outstream.close();\r
 243         }\r
 244 \r
 245         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
 246                         final List<FastaSequence> sequences, final int width)\r
 247                         throws IOException {\r
 248                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
 249                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
 250                 for (final FastaSequence fs : sequences) {\r
 251                         fastawriter.write(">" + fs.getId() + "\n");\r
 252                         fastawriter.write(fs.getFormatedSequence(width));\r
 253                         fastawriter.write("\n");\r
 254                 }\r
 255                 fastawriter.flush();\r
 256                 writer.flush();\r
 257         }\r
 258 \r
 259         /**\r
 260          * Reads fasta sequences from inStream into the list of FastaSequence\r
 261          * objects\r
 262          * \r
 263          * @param inStream\r
 264          *            from\r
 265          * @return list of FastaSequence objects\r
 266          * @throws IOException\r
 267          */\r
 268         public static List<FastaSequence> readFasta(final InputStream inStream)\r
 269                         throws IOException {\r
 270                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
 271 \r
 272                 final BufferedReader infasta = new BufferedReader(\r
 273                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
 274                 final Pattern pattern = Pattern.compile("//s+");\r
 275 \r
 276                 String line;\r
 277                 String sname = "", seqstr = null;\r
 278                 do {\r
 279                         line = infasta.readLine();\r
 280                         if ((line == null) || line.startsWith(">")) {\r
 281                                 if (seqstr != null) {\r
 282                                         seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
 283                                 }\r
 284                                 sname = line; // remove >\r
 285                                 seqstr = "";\r
 286                         } else {\r
 287                                 final String subseq = pattern.matcher(line).replaceAll("");\r
 288                                 seqstr += subseq;\r
 289                         }\r
 290                 } while (line != null);\r
 291 \r
 292                 infasta.close();\r
 293                 return seqs;\r
 294         }\r
 295 \r
 296         /**\r
 297          * Writes FastaSequence in the file, each sequence will take one line only\r
 298          * \r
 299          * @param os\r
 300          * @param sequences\r
 301          * @throws IOException\r
 302          */\r
 303         public static void writeFasta(final OutputStream os,\r
 304                         final List<FastaSequence> sequences) throws IOException {\r
 305                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
 306                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
 307                 for (final FastaSequence fs : sequences) {\r
 308                         fasta_out.write(fs.getOnelineFasta());\r
 309                 }\r
 310                 fasta_out.close();\r
 311                 outWriter.close();\r
 312         }\r
 313 \r
 314         /**\r
 315          * Read IUPred output\r
 316          * \r
 317          * @param result\r
 318          * @return\r
 319          * @throws IOException\r
 320          * @throws UnknownFileFormatException\r
 321          */\r
 322         public static Map<String, Score> readIUPred(final File result)\r
 323                         throws IOException, UnknownFileFormatException {\r
 324                 InputStream input = new FileInputStream(result);\r
 325                 Map<String, Score> sequences = readIUPred(input,\r
 326                                 IUPredResult.getType(result));\r
 327                 input.close();\r
 328                 return sequences;\r
 329         }\r
 330 \r
 331         // Check the type of the file e.g. long| short or domain\r
 332         // and read\r
 333         /**\r
 334          * ## Long Disorder\r
 335          * \r
 336          * # P53_HUMAN\r
 337          * \r
 338          * 1 M 0.9943\r
 339          * \r
 340          * 2 E 0.9917\r
 341          * \r
 342          * 3 E 0.9879\r
 343          * \r
 344          * (every line)\r
 345          * \r
 346          * @throws IOException\r
 347          * @throws UnknownFileFormatException\r
 348          * \r
 349          * \r
 350          */\r
 351         private static Map<String, Score> readIUPred(InputStream input,\r
 352                         IUPredResult type) throws IOException, UnknownFileFormatException {\r
 353 \r
 354                 Score score = null;\r
 355                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
 356                 Scanner scan = new Scanner(input);\r
 357                 scan.useDelimiter("#");\r
 358                 while (scan.hasNext()) {\r
 359                         String nextEntry = scan.next();\r
 360                         Scanner entry = new Scanner(nextEntry);\r
 361                         String name = entry.nextLine().trim();\r
 362                         // inside entry:\r
 363                         if (IUPredResult.Glob == type) {\r
 364                                 // parse domains\r
 365                                 TreeSet<Range> ranges = parseIUPredDomains(entry);\r
 366                                 score = new Score(type, ranges);\r
 367                         } else {\r
 368                                 // parse short | long\r
 369                                 float[] scores = parseIUPredScores(entry);\r
 370                                 score = new Score(type, scores);\r
 371                         }\r
 372                         entry.close();\r
 373                         seqs.put(name, score);\r
 374                 }\r
 375 \r
 376                 scan.close();\r
 377                 return seqs;\r
 378         }\r
 379 \r
 380         /**\r
 381          * # P53_HUMA\r
 382          * \r
 383          * Number of globular domains: 2\r
 384          * \r
 385          * globular domain 1. 98 - 269\r
 386          * \r
 387          * globular domain 2. 431 - 482\r
 388          * \r
 389          * >P53_HUMA\r
 390          * \r
 391          * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp\r
 392          * \r
 393          * @param scan\r
 394          */\r
 395         private static TreeSet<Range> parseIUPredDomains(Scanner scan) {\r
 396                 String header = "Number of globular domains:";\r
 397                 String domainPref = "globular domain";\r
 398                 TreeSet<Range> ranges = new TreeSet<Range>();\r
 399                 String line = scan.nextLine().trim();\r
 400                 assert line.startsWith(header);\r
 401                 line = line.substring(header.length()).trim();\r
 402                 int domainNum = Integer.parseInt(line);\r
 403                 if (domainNum == 0) {\r
 404                         return ranges;\r
 405                 }\r
 406 \r
 407                 for (int i = 0; i < domainNum; i++) {\r
 408                         assert scan.hasNextLine();\r
 409                         line = scan.nextLine();\r
 410                         assert line.trim().startsWith(domainPref);\r
 411                         line = line.substring(line.indexOf(".") + 1).trim();\r
 412                         Range r = new Range(line.split("-"));\r
 413                         ranges.add(r);\r
 414                 }\r
 415 \r
 416                 return ranges;\r
 417         }\r
 418         /*\r
 419          * 1 M 0.9943\r
 420          * \r
 421          * 2 E 0.9917\r
 422          */\r
 423         private static float[] parseIUPredScores(Scanner scan)\r
 424                         throws UnknownFileFormatException {\r
 425                 List<String> annotation = new ArrayList<String>();\r
 426                 while (scan.hasNextLine()) {\r
 427                         String line = scan.nextLine().trim();\r
 428                         String[] val = line.split("\\s+");\r
 429                         annotation.add(val[2]);\r
 430                 }\r
 431                 return convertToNumber(annotation\r
 432                                 .toArray(new String[annotation.size()]));\r
 433         }\r
 434 \r
 435         public static Map<String, Score> readJRonn(final File result)\r
 436                         throws IOException, UnknownFileFormatException {\r
 437                 InputStream input = new FileInputStream(result);\r
 438                 Map<String, Score> sequences = readJRonn(input);\r
 439                 input.close();\r
 440                 return sequences;\r
 441         }\r
 442 \r
 443         /**\r
 444          * Reader for JRonn horizontal file format\r
 445          * \r
 446          * <pre>\r
 447          * &gtFoobar M G D T T A G 0.48 0.42\r
 448          * 0.42 0.48 0.52 0.53 0.54\r
 449          * \r
 450          * <pre>\r
 451          * Where all values are tab delimited\r
 452          * \r
 453          * @param inStream\r
 454          *            the InputStream connected to the JRonn output file\r
 455          * @return List of {@link AnnotatedSequence} objects\r
 456          * @throws IOException\r
 457          *             is thrown if the inStream has problems accessing the data\r
 458          * @throws UnknownFileFormatException\r
 459          *             is thrown if the inStream represents an unknown source of\r
 460          * data, i.e. not a JRonn output\r
 461          */\r
 462         public static Map<String, Score> readJRonn(final InputStream inStream)\r
 463                         throws IOException, UnknownFileFormatException {\r
 464                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
 465 \r
 466                 final BufferedReader infasta = new BufferedReader(\r
 467                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
 468 \r
 469                 String line;\r
 470                 String sname = "";\r
 471                 do {\r
 472                         line = infasta.readLine();\r
 473                         if (line == null || line.isEmpty()) {\r
 474                                 // skip empty lines\r
 475                                 continue;\r
 476                         }\r
 477                         if (line.startsWith(">")) {\r
 478                                 // read name\r
 479                                 sname = line.trim().substring(1);\r
 480                                 // read sequence line\r
 481                                 line = infasta.readLine();\r
 482                                 final String sequence = line.replace("\t", "");\r
 483                                 // read annotation line\r
 484                                 line = infasta.readLine();\r
 485                                 String[] annotValues = line.split("\t");\r
 486                                 float[] annotation = convertToNumber(annotValues);\r
 487                                 if (annotation.length != sequence.length()) {\r
 488                                         throw new UnknownFileFormatException(\r
 489                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
 490                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
 491                                 }\r
 492                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
 493                         }\r
 494                 } while (line != null);\r
 495 \r
 496                 infasta.close();\r
 497                 return seqs;\r
 498         }\r
 499 \r
 500         private static float[] convertToNumber(String[] annotValues)\r
 501                         throws UnknownFileFormatException {\r
 502                 float[] annotation = new float[annotValues.length];\r
 503                 try {\r
 504                         for (int i = 0; i < annotation.length; i++) {\r
 505                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
 506                         }\r
 507                 } catch (NumberFormatException e) {\r
 508                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
 509                                         e.getCause());\r
 510                 }\r
 511                 return annotation;\r
 512         }\r
 513 \r
        /**
         * Description of the expected JRonn horizontal file format. It is used
         * as (part of) the message of the UnknownFileFormatException thrown when
         * JRonn output or numeric score values cannot be parsed.
         */
        private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"
                        + ">sequence_name\n "
                        + "M    V       S\n"
                        + "0.43 0.22    0.65\n"
                        + "Where first line is the sequence name,\n"
                        + "second line is the tab delimited sequence,\n"
                        + "third line contains tab delimited disorder prediction values.\n"
                        + "No lines are allowed between these three. Additionally, the number of  "
                        + "sequence residues must be equal to the number of the disorder values.";
 523 \r
 524         /**\r
 525          * Closes the Closable and logs the exception if any\r
 526          * \r
 527          * @param log\r
 528          * @param stream\r
 529          */\r
 530         public final static void closeSilently(java.util.logging.Logger log,\r
 531                         Closeable stream) {\r
 532                 if (stream != null) {\r
 533                         try {\r
 534                                 stream.close();\r
 535                         } catch (IOException e) {\r
 536                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
 537                         }\r
 538                 }\r
 539         }\r
 540 \r
 541         /**\r
 542          * \r
 543          > Foobar_dundeefriends\r
 544          * \r
 545          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
 546          * \r
 547          * # REM465 355-368\r
 548          * \r
 549          * # HOTLOOPS 190-204\r
 550          * \r
 551          * # RESIDUE COILS REM465 HOTLOOPS\r
 552          * \r
 553          * M 0.86010 0.88512 0.37094\r
 554          * \r
 555          * T 0.79983 0.85864 0.44331\r
 556          * \r
 557          * >Next Sequence name\r
 558          * \r
 559          * \r
 560          * @param input\r
 561          * @return\r
 562          * @throws IOException\r
 563          * @throws UnknownFileFormatException\r
 564          */\r
 565         public static HashMap<String, Set<Score>> readDisembl(\r
 566                         final InputStream input) throws IOException,\r
 567                         UnknownFileFormatException {\r
 568                 Scanner scan = new Scanner(input);\r
 569                 scan.useDelimiter(">");\r
 570                 if (!scan.hasNext()) {\r
 571                         throw new UnknownFileFormatException(\r
 572                                         "In Disembl score format each sequence score is expected "\r
 573                                                         + "to start from the line: >Sequence name "\r
 574                                                         + " No such line was found!");\r
 575                 }\r
 576 \r
 577                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
 578                 int seqCounter = 0;\r
 579                 while (scan.hasNext()) {\r
 580                         seqCounter++;\r
 581                         String singleSeq = scan.next();\r
 582                         Scanner scansingle = new Scanner(singleSeq);\r
 583                         if (!scansingle.hasNextLine()) {\r
 584                                 throw new RuntimeException(\r
 585                                                 "The input looks like an incomplete disembl file - cannot parse!");\r
 586                         }\r
 587 \r
 588                         StringBuffer seqbuffer = new StringBuffer();\r
 589                         ArrayList<Float> coils = new ArrayList<Float>();\r
 590                         ArrayList<Float> rem = new ArrayList<Float>();\r
 591                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
 592 \r
 593                         String sequenceName = scansingle.nextLine().trim();\r
 594                         TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
 595                                         scansingle.nextLine());\r
 596                         TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
 597                                         scansingle.nextLine());\r
 598                         TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
 599                                         scansingle.nextLine());\r
 600 \r
 601                         String title = scansingle.nextLine();\r
 602                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
 603 \r
 604                         while (scansingle.hasNext()) {\r
 605                                 seqbuffer.append(scansingle.next());\r
 606                                 coils.add(scansingle.nextFloat());\r
 607                                 rem.add(scansingle.nextFloat());\r
 608                                 hotloops.add(scansingle.nextFloat());\r
 609                         }\r
 610                         /*\r
 611                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
 612                          * seqbuffer.toString());\r
 613                          */\r
 614                         HashSet<Score> scores = new HashSet<Score>();\r
 615                         scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
 616                         scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, rem465R));\r
 617                         scores.add(new Score(DisemblResult.REM465, rem, loopsR));\r
 618                         results.put(sequenceName, scores);\r
 619 \r
 620                         scansingle.close();\r
 621                 }\r
 622                 scan.close();\r
 623                 input.close();\r
 624                 return results;\r
 625         }\r
 626 \r
 627         /**\r
 628          * Parsing:\r
 629          * \r
 630          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
 631          * 350-391, 429-485, 497-506, 539-547\r
 632          * \r
 633          * # REM465 355-368\r
 634          * \r
 635          * # HOTLOOPS 190-204\r
 636          * \r
 637          * @param lines\r
 638          * @return\r
 639          */\r
 640         private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
 641                 TreeSet<Range> ranges = new TreeSet<Range>();\r
 642 \r
 643                 Scanner scan = new Scanner(lines);\r
 644 \r
 645                 assert scan.hasNext();\r
 646                 String del = scan.next();\r
 647                 assert "#".equals(del); // pass delimiter #\r
 648                 String type = scan.next(); // pass enum name e.g. COILS\r
 649                 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
 650                                 + resultType.toString();\r
 651 \r
 652                 // beginning of the ranges\r
 653                 scan.useDelimiter(",");\r
 654                 while (scan.hasNext()) {\r
 655                         String range = scan.next();\r
 656                         if (!Util.isEmpty(range)) {\r
 657                                 ranges.add(new Range(range.split("-")));\r
 658                         }\r
 659                 }\r
 660                 return ranges;\r
 661         }\r
 662 \r
 663         /**\r
 664          * \r
 665          > Foobar_dundeefriends\r
 666          * \r
 667          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
 668          * \r
 669          * # REM465 355-368\r
 670          * \r
 671          * # HOTLOOPS 190-204\r
 672          * \r
 673          * # RESIDUE COILS REM465 HOTLOOPS\r
 674          * \r
 675          * M 0.86010 0.88512 0.37094\r
 676          * \r
 677          * T 0.79983 0.85864 0.44331\r
 678          * \r
 679          * >Next Sequence name\r
 680          * \r
 681          * \r
 682          * @param input\r
 683          * @return\r
 684          * @throws IOException\r
 685          * @throws UnknownFileFormatException\r
 686          */\r
 687         public static HashMap<String, Set<Score>> readGlobPlot(\r
 688                         final InputStream input) throws IOException,\r
 689                         UnknownFileFormatException {\r
 690                 Scanner scan = new Scanner(input);\r
 691                 scan.useDelimiter(">");\r
 692                 if (!scan.hasNext()) {\r
 693                         throw new UnknownFileFormatException(\r
 694                                         "In GlobPlot score format each sequence score is expected "\r
 695                                                         + "to start from the line: >Sequence name "\r
 696                                                         + " No such line was found!");\r
 697                 }\r
 698 \r
 699                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
 700                 int seqCounter = 0;\r
 701                 while (scan.hasNext()) {\r
 702                         seqCounter++;\r
 703                         String singleSeq = scan.next();\r
 704                         Scanner scansingle = new Scanner(singleSeq);\r
 705                         if (!scansingle.hasNextLine()) {\r
 706                                 throw new RuntimeException(\r
 707                                                 "The input looks like an incomplete GlobPlot file - cannot parse!");\r
 708                         }\r
 709 \r
 710                         StringBuffer seqbuffer = new StringBuffer();\r
 711                         ArrayList<Float> dydxScore = new ArrayList<Float>();\r
 712                         ArrayList<Float> rawScore = new ArrayList<Float>();\r
 713                         ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
 714 \r
 715                         String sequenceName = scansingle.nextLine().trim();\r
 716                         TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
 717                                         scansingle.nextLine());\r
 718                         TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
 719                                         scansingle.nextLine());\r
 720 \r
 721                         String title = scansingle.nextLine();\r
 722                         assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
 723 \r
 724                         while (scansingle.hasNext()) {\r
 725                                 seqbuffer.append(scansingle.next());\r
 726                                 dydxScore.add(scansingle.nextFloat());\r
 727                                 rawScore.add(scansingle.nextFloat());\r
 728                                 smoothedScore.add(scansingle.nextFloat());\r
 729                         }\r
 730                         /*\r
 731                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
 732                          * seqbuffer.toString());\r
 733                          */\r
 734                         Set<Score> scores = new TreeSet<Score>();\r
 735                         scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
 736                         scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
 737                         scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
 738                         scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
 739                         scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
 740                         results.put(sequenceName, scores);\r
 741 \r
 742                         scansingle.close();\r
 743                 }\r
 744                 scan.close();\r
 745                 input.close();\r
 746                 return results;\r
 747         }\r
 748         /**\r
 749          * Read AACon result with no alignment files. This method leaves incoming\r
 750          * InputStream open!\r
 751          * \r
 752          * @param results\r
 753          *            output file of AAConservation\r
 754          * @return Map with keys {@link ConservationMethod} -> float[]\r
 755          */\r
 756         public static HashSet<Score> readAAConResults(InputStream results) {\r
 757                 if (results == null) {\r
 758                         throw new NullPointerException(\r
 759                                         "InputStream with results must be provided");\r
 760                 }\r
 761                 HashSet<Score> annotations = new HashSet<Score>();\r
 762                 Scanner sc = new Scanner(results);\r
 763                 sc.useDelimiter("#");\r
 764                 while (sc.hasNext()) {\r
 765                         String line = sc.next();\r
 766                         int spacePos = line.indexOf(" ");\r
 767                         assert spacePos > 0 : "Space is expected as delimited between method "\r
 768                                         + "name and values!";\r
 769                         String methodLine = line.substring(0, spacePos);\r
 770                         ConservationMethod method = ConservationMethod\r
 771                                         .getMethod(methodLine);\r
 772                         assert method != null : "Method " + methodLine\r
 773                                         + " is not recognized! ";\r
 774                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
 775                         ArrayList<Float> values = new ArrayList<Float>();\r
 776                         while (valuesScanner.hasNextDouble()) {\r
 777                                 Double value = valuesScanner.nextDouble();\r
 778                                 values.add(value.floatValue());\r
 779                         }\r
 780                         annotations.add(new Score(method, values));\r
 781                 }\r
 782                 return annotations;\r
 783         }\r
 784 \r
 785         /**\r
 786          * Reads and parses Fasta or Clustal formatted file into a list of\r
 787          * FastaSequence objects\r
 788          * \r
 789          * @param inFilePath\r
 790          *            the path to the input file\r
 791          * @throws IOException\r
 792          *             if the file denoted by inFilePath cannot be read\r
 793          * @throws UnknownFileFormatException\r
 794          *             if the inFilePath points to the file which format cannot be\r
 795          *             recognised\r
 796          * @return the List of FastaSequence objects\r
 797          * \r
 798          */\r
 799         public static List<FastaSequence> openInputStream(String inFilePath)\r
 800                         throws IOException, UnknownFileFormatException {\r
 801 \r
 802                 // This stream gets closed in isValidClustalFile method\r
 803                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
 804                 // This stream is closed in the calling methods\r
 805                 InputStream inStr = new FileInputStream(inFilePath);\r
 806                 List<FastaSequence> fastaSeqs = null;\r
 807                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
 808                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
 809                         // alignment cannot be null see\r
 810                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
 811                         fastaSeqs = al.getSequences();\r
 812                 } else {\r
 813                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
 814                 }\r
 815                 return fastaSeqs;\r
 816         }\r
 817 \r
 818 }\r
 819 \r
 820 enum DisemblResult {\r
 821         /** These contains ranges and scores */\r
 822         COILS, REM465, HOTLOOPS\r
 823 }\r
 824 enum GlobProtResult {\r
 825         /** This a range with no scores */\r
 826         GlobDoms,\r
 827         /** This a range with no scores */\r
 828         Disorder,\r
 829         /** This a score with no range */\r
 830         Dydx,\r
 831         /** This a score with no range */\r
 832         SmoothedScore,\r
 833         /** This a score with no range */\r
 834         RawScore\r
 835 }\r
 836 \r
 837 enum IUPredResult {\r
 838         /**\r
 839          * Short disorder\r
 840          */\r
 841         Short,\r
 842         /**\r
 843          * Long disorder\r
 844          */\r
 845         Long,\r
 846         /**\r
 847          * Globular domains\r
 848          */\r
 849         Glob;\r
 850 \r
 851         static IUPredResult getType(File file) {\r
 852                 assert file != null;\r
 853                 String name = file.getName();\r
 854                 if (name.endsWith(Long.toString().toLowerCase())) {\r
 855                         return Long;\r
 856                 }\r
 857                 if (name.endsWith(Short.toString().toLowerCase())) {\r
 858                         return Short;\r
 859                 }\r
 860                 if (name.endsWith(Glob.toString().toLowerCase())) {\r
 861                         return Glob;\r
 862                 }\r
 863                 throw new AssertionError(\r
 864                                 "IUPred result file type cannot be recognised! "\r
 865                                                 + "\nFile must ends with one of [glob, long or short]"\r
 866                                                 + "\n but given file name was: " + file.getName());\r
 867         }\r
 868 }