datamodel/compbio/data/sequence/SequenceUtil.java

   1 /*\r
   2  * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin\r
   3  * Jalview Web Services version: 2.0 This library is free software; you can\r
   4  * redistribute it and/or modify it under the terms of the Apache License\r
   5  * version 2 as published by the Apache Software Foundation This library is\r
   6  * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;\r
   7  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A\r
   8  * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the\r
   9  * license is in apache_license.txt. It is also available here: see:\r
  10  * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived\r
  11  * work distributed in source code form must include this copyright and license\r
  12  * notice.\r
  13  */\r
  14 \r
  15 package compbio.data.sequence;\r
  16 \r
  17 import java.io.BufferedReader;\r
  18 import java.io.BufferedWriter;\r
  19 import java.io.Closeable;\r
  20 import java.io.File;\r
  21 import java.io.FileInputStream;\r
  22 import java.io.IOException;\r
  23 import java.io.InputStream;\r
  24 import java.io.InputStreamReader;\r
  25 import java.io.OutputStream;\r
  26 import java.io.OutputStreamWriter;\r
  27 import java.util.ArrayList;\r
  28 import java.util.HashMap;\r
  29 import java.util.HashSet;\r
  30 import java.util.List;\r
  31 import java.util.Map;\r
  32 import java.util.Scanner;\r
  33 import java.util.Set;\r
  34 import java.util.TreeSet;\r
  35 import java.util.logging.Level;\r
  36 import java.util.regex.Matcher;\r
  37 import java.util.regex.Pattern;\r
  38 \r
  39 import compbio.util.Util;\r
  40 \r
  41 /**\r
  42  * Utility class for operations on sequences\r
  43  * \r
  44  * @author Petr Troshin\r
  45  * @version 1.0\r
  46  */\r
  47 public final class SequenceUtil {\r
  48 \r
  49         /**\r
  50          * A whitespace character: [\t\n\x0B\f\r]\r
  51          */\r
  52         public static final Pattern WHITE_SPACE = Pattern.compile("\\s");\r
  53 \r
  54         /**\r
  55          * A digit\r
  56          */\r
  57         public static final Pattern DIGIT = Pattern.compile("\\d");\r
  58 \r
  59         /**\r
  60          * Non word\r
  61          */\r
  62         public static final Pattern NONWORD = Pattern.compile("\\W");\r
  63 \r
  64         /**\r
  65          * Valid Amino acids\r
  66          */\r
  67         public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",\r
  68                         Pattern.CASE_INSENSITIVE);\r
  69 \r
  70         /**\r
  71          * inversion of AA pattern\r
  72          */\r
  73         public static final Pattern NON_AA = Pattern.compile(\r
  74                         "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);\r
  75 \r
  76         /**\r
  77          * Same as AA pattern but with two additional letters - XU\r
  78          */\r
  79         public static final Pattern AMBIGUOUS_AA = Pattern.compile(\r
  80                         "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);\r
  81 \r
  82         /**\r
  83          * Nucleotides a, t, g, c, u\r
  84          */\r
  85         public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",\r
  86                         Pattern.CASE_INSENSITIVE);\r
  87 \r
  88         /**\r
  89          * Ambiguous nucleotide\r
  90          */\r
  91         public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(\r
  92                         "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC\r
  93         /**\r
  94          * Non nucleotide\r
  95          */\r
  96         public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",\r
  97                         Pattern.CASE_INSENSITIVE);\r
  98 \r
  99         private SequenceUtil() {\r
 100         } // utility class, no instantiation\r
 101 \r
 102         /*\r
 103          * public static void write_PirSeq(OutputStream os, FastaSequence seq)\r
 104          * throws IOException { BufferedWriter pir_out = new BufferedWriter(new\r
 105          * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() +\r
 106          * SysPrefs.newlinechar); pir_out.write(seq.getSequence() +\r
 107          * SysPrefs.newlinechar); pir_out.close(); } public static void\r
 108          * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException {\r
 109          * BufferedWriter fasta_out = new BufferedWriter( new\r
 110          * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() +\r
 111          * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() +\r
 112          * SysPrefs.newlinechar); fasta_out.close(); }\r
 113          */\r
 114 \r
 115         /**\r
 116          * @return true is the sequence contains only letters a,c, t, g, u\r
 117          */\r
 118         public static boolean isNucleotideSequence(final FastaSequence s) {\r
 119                 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());\r
 120         }\r
 121 \r
 122         /**\r
 123          * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one\r
 124          * (!) - B char\r
 125          */\r
 126         public static boolean isNonAmbNucleotideSequence(String sequence) {\r
 127                 sequence = SequenceUtil.cleanSequence(sequence);\r
 128                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 129                         return false;\r
 130                 }\r
 131                 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {\r
 132                         return false;\r
 133                         /*\r
 134                          * System.out.format("I found the text starting at " +\r
 135                          * "index %d and ending at index %d.%n", nonDNAmatcher .start(),\r
 136                          * nonDNAmatcher.end());\r
 137                          */\r
 138                 }\r
 139                 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);\r
 140                 return DNAmatcher.find();\r
 141         }\r
 142 \r
 143         /**\r
 144          * Removes all whitespace chars in the sequence string\r
 145          * \r
 146          * @param sequence\r
 147          * @return cleaned up sequence\r
 148          */\r
 149         public static String cleanSequence(String sequence) {\r
 150                 assert sequence != null;\r
 151                 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);\r
 152                 sequence = m.replaceAll("").toUpperCase();\r
 153                 return sequence;\r
 154         }\r
 155 \r
 156         /**\r
 157          * Removes all special characters and digits as well as whitespace chars\r
 158          * from the sequence\r
 159          * \r
 160          * @param sequence\r
 161          * @return cleaned up sequence\r
 162          */\r
 163         public static String deepCleanSequence(String sequence) {\r
 164                 sequence = SequenceUtil.cleanSequence(sequence);\r
 165                 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");\r
 166                 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");\r
 167                 final Pattern othernonSeqChars = Pattern.compile("[_-]+");\r
 168                 sequence = othernonSeqChars.matcher(sequence).replaceAll("");\r
 169                 return sequence;\r
 170         }\r
 171 \r
 172         /**\r
 173          * Remove all non AA chars from the sequence\r
 174          * \r
 175          * @param sequence\r
 176          *            the sequence to clean\r
 177          * @return cleaned sequence\r
 178          */\r
 179         public static String cleanProteinSequence(String sequence) {\r
 180                 return SequenceUtil.NON_AA.matcher(sequence).replaceAll("");\r
 181         }\r
 182 \r
 183         /**\r
 184          * @param sequence\r
 185          * @return true is the sequence is a protein sequence, false overwise\r
 186          */\r
 187         public static boolean isProteinSequence(String sequence) {\r
 188                 sequence = SequenceUtil.cleanSequence(sequence);\r
 189                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 190                         return false;\r
 191                 }\r
 192                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 193                         return false;\r
 194                 }\r
 195                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 196                         return false;\r
 197                 }\r
 198                 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);\r
 199                 return protmatcher.find();\r
 200         }\r
 201 \r
 202         /**\r
 203          * Check whether the sequence confirms to amboguous protein sequence\r
 204          * \r
 205          * @param sequence\r
 206          * @return return true only if the sequence if ambiguous protein sequence\r
 207          *         Return false otherwise. e.g. if the sequence is non-ambiguous\r
 208          *         protein or DNA\r
 209          */\r
 210         public static boolean isAmbiguosProtein(String sequence) {\r
 211                 sequence = SequenceUtil.cleanSequence(sequence);\r
 212                 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {\r
 213                         return false;\r
 214                 }\r
 215                 if (SequenceUtil.DIGIT.matcher(sequence).find()) {\r
 216                         return false;\r
 217                 }\r
 218                 if (SequenceUtil.NON_AA.matcher(sequence).find()) {\r
 219                         return false;\r
 220                 }\r
 221                 if (SequenceUtil.AA.matcher(sequence).find()) {\r
 222                         return false;\r
 223                 }\r
 224                 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);\r
 225                 return amb_prot.find();\r
 226         }\r
 227 \r
 228         /**\r
 229          * Writes list of FastaSequeces into the outstream formatting the sequence\r
 230          * so that it contains width chars on each line\r
 231          * \r
 232          * @param outstream\r
 233          * @param sequences\r
 234          * @param width\r
 235          *            - the maximum number of characters to write in one line\r
 236          * @throws IOException\r
 237          */\r
 238         public static void writeFasta(final OutputStream outstream,\r
 239                         final List<FastaSequence> sequences, final int width)\r
 240                         throws IOException {\r
 241                 writeFastaKeepTheStream(outstream, sequences, width);\r
 242                 outstream.close();\r
 243         }\r
 244 \r
 245         public static void writeFastaKeepTheStream(final OutputStream outstream,\r
 246                         final List<FastaSequence> sequences, final int width)\r
 247                         throws IOException {\r
 248                 final OutputStreamWriter writer = new OutputStreamWriter(outstream);\r
 249                 final BufferedWriter fastawriter = new BufferedWriter(writer);\r
 250                 for (final FastaSequence fs : sequences) {\r
 251                         fastawriter.write(">" + fs.getId() + "\n");\r
 252                         fastawriter.write(fs.getFormatedSequence(width));\r
 253                         fastawriter.write("\n");\r
 254                 }\r
 255                 fastawriter.flush();\r
 256                 writer.flush();\r
 257         }\r
 258 \r
 259         /**\r
 260          * Reads fasta sequences from inStream into the list of FastaSequence\r
 261          * objects\r
 262          * \r
 263          * @param inStream\r
 264          *            from\r
 265          * @return list of FastaSequence objects\r
 266          * @throws IOException\r
 267          */\r
 268         public static List<FastaSequence> readFasta(final InputStream inStream)\r
 269                         throws IOException {\r
 270                 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();\r
 271 \r
 272                 final BufferedReader infasta = new BufferedReader(\r
 273                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
 274                 final Pattern pattern = Pattern.compile("//s+");\r
 275 \r
 276                 String line;\r
 277                 String sname = "", seqstr = null;\r
 278                 do {\r
 279                         line = infasta.readLine();\r
 280                         if ((line == null) || line.startsWith(">")) {\r
 281                                 if (seqstr != null) {\r
 282                                         seqs.add(new FastaSequence(sname.substring(1), seqstr));\r
 283                                 }\r
 284                                 sname = line; // remove >\r
 285                                 seqstr = "";\r
 286                         } else {\r
 287                                 final String subseq = pattern.matcher(line).replaceAll("");\r
 288                                 seqstr += subseq;\r
 289                         }\r
 290                 } while (line != null);\r
 291 \r
 292                 infasta.close();\r
 293                 return seqs;\r
 294         }\r
 295 \r
 296         /**\r
 297          * Writes FastaSequence in the file, each sequence will take one line only\r
 298          * \r
 299          * @param os\r
 300          * @param sequences\r
 301          * @throws IOException\r
 302          */\r
 303         public static void writeFasta(final OutputStream os,\r
 304                         final List<FastaSequence> sequences) throws IOException {\r
 305                 final OutputStreamWriter outWriter = new OutputStreamWriter(os);\r
 306                 final BufferedWriter fasta_out = new BufferedWriter(outWriter);\r
 307                 for (final FastaSequence fs : sequences) {\r
 308                         fasta_out.write(fs.getOnelineFasta());\r
 309                 }\r
 310                 fasta_out.close();\r
 311                 outWriter.close();\r
 312         }\r
 313 \r
 314         public static Map<String, Score> readJRonn(final File result)\r
 315                         throws IOException, UnknownFileFormatException {\r
 316                 InputStream input = new FileInputStream(result);\r
 317                 Map<String, Score> sequences = readJRonn(input);\r
 318                 input.close();\r
 319                 return sequences;\r
 320         }\r
 321 \r
 322         /**\r
 323          * Reader for JRonn horizontal file format\r
 324          * \r
 325          * <pre>\r
 326          * &gtFoobar M G D T T A G 0.48 0.42\r
 327          * 0.42 0.48 0.52 0.53 0.54\r
 328          * \r
 329          * <pre>\r
 330          * Where all values are tab delimited\r
 331          * \r
 332          * @param inStream\r
 333          *            the InputStream connected to the JRonn output file\r
 334          * @return List of {@link AnnotatedSequence} objects\r
 335          * @throws IOException\r
 336          *             is thrown if the inStream has problems accessing the data\r
 337          * @throws UnknownFileFormatException\r
 338          *             is thrown if the inStream represents an unknown source of\r
 339          * data, i.e. not a JRonn output\r
 340          */\r
 341         public static Map<String, Score> readJRonn(final InputStream inStream)\r
 342                         throws IOException, UnknownFileFormatException {\r
 343                 final Map<String, Score> seqs = new HashMap<String, Score>();\r
 344 \r
 345                 final BufferedReader infasta = new BufferedReader(\r
 346                                 new InputStreamReader(inStream, "UTF8"), 16000);\r
 347 \r
 348                 String line;\r
 349                 String sname = "";\r
 350                 do {\r
 351                         line = infasta.readLine();\r
 352                         if (line == null || line.isEmpty()) {\r
 353                                 // skip empty lines\r
 354                                 continue;\r
 355                         }\r
 356                         if (line.startsWith(">")) {\r
 357                                 // read name\r
 358                                 sname = line.trim().substring(1);\r
 359                                 // read sequence line\r
 360                                 line = infasta.readLine();\r
 361                                 final String sequence = line.replace("\t", "");\r
 362                                 // read annotation line\r
 363                                 line = infasta.readLine();\r
 364                                 String[] annotValues = line.split("\t");\r
 365                                 float[] annotation = convertToNumber(annotValues);\r
 366                                 if (annotation.length != sequence.length()) {\r
 367                                         throw new UnknownFileFormatException(\r
 368                                                         "File does not look like Jronn horizontally formatted output file!\n"\r
 369                                                                         + JRONN_WRONG_FORMAT_MESSAGE);\r
 370                                 }\r
 371                                 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));\r
 372                         }\r
 373                 } while (line != null);\r
 374 \r
 375                 infasta.close();\r
 376                 return seqs;\r
 377         }\r
 378 \r
 379         private static float[] convertToNumber(String[] annotValues)\r
 380                         throws UnknownFileFormatException {\r
 381                 float[] annotation = new float[annotValues.length];\r
 382                 try {\r
 383                         for (int i = 0; i < annotation.length; i++) {\r
 384                                 annotation[i] = Float.parseFloat(annotValues[i]);\r
 385                         }\r
 386                 } catch (NumberFormatException e) {\r
 387                         throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,\r
 388                                         e.getCause());\r
 389                 }\r
 390                 return annotation;\r
 391         }\r
 392 \r
 393         private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"\r
 394                         + ">sequence_name\n "\r
 395                         + "M    V       S\n"\r
 396                         + "0.43 0.22    0.65\n"\r
 397                         + "Where first line is the sequence name,\n"\r
 398                         + "second line is the tab delimited sequence,\n"\r
 399                         + "third line contains tab delimited disorder prediction values.\n"\r
 400                         + "No lines are allowed between these three. Additionally, the number of  "\r
 401                         + "sequence residues must be equal to the number of the disorder values.";\r
 402 \r
 403         /**\r
 404          * Closes the Closable and logs the exception if any\r
 405          * \r
 406          * @param log\r
 407          * @param stream\r
 408          */\r
 409         public final static void closeSilently(java.util.logging.Logger log,\r
 410                         Closeable stream) {\r
 411                 if (stream != null) {\r
 412                         try {\r
 413                                 stream.close();\r
 414                         } catch (IOException e) {\r
 415                                 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());\r
 416                         }\r
 417                 }\r
 418         }\r
 419 \r
 420         /**\r
 421          * \r
 422          > Foobar_dundeefriends\r
 423          * \r
 424          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
 425          * \r
 426          * # REM465 355-368\r
 427          * \r
 428          * # HOTLOOPS 190-204\r
 429          * \r
 430          * # RESIDUE COILS REM465 HOTLOOPS\r
 431          * \r
 432          * M 0.86010 0.88512 0.37094\r
 433          * \r
 434          * T 0.79983 0.85864 0.44331\r
 435          * \r
 436          * >Next Sequence name\r
 437          * \r
 438          * \r
 439          * @param input\r
 440          * @return\r
 441          * @throws IOException\r
 442          * @throws UnknownFileFormatException\r
 443          */\r
 444         public static HashMap<String, Set<Score>> readDisembl(\r
 445                         final InputStream input) throws IOException,\r
 446                         UnknownFileFormatException {\r
 447                 Scanner scan = new Scanner(input);\r
 448                 scan.useDelimiter(">");\r
 449                 if (!scan.hasNext()) {\r
 450                         throw new UnknownFileFormatException(\r
 451                                         "In Disembl score format each sequence score is expected "\r
 452                                                         + "to start from the line: >Sequence name "\r
 453                                                         + " No such line was found!");\r
 454                 }\r
 455 \r
 456                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
 457                 int seqCounter = 0;\r
 458                 while (scan.hasNext()) {\r
 459                         seqCounter++;\r
 460                         String singleSeq = scan.next();\r
 461                         Scanner scansingle = new Scanner(singleSeq);\r
 462                         if (!scansingle.hasNextLine()) {\r
 463                                 throw new RuntimeException(\r
 464                                                 "The input looks like an incomplete disembl file - cannot parse!");\r
 465                         }\r
 466 \r
 467                         StringBuffer seqbuffer = new StringBuffer();\r
 468                         ArrayList<Float> coils = new ArrayList<Float>();\r
 469                         ArrayList<Float> rem = new ArrayList<Float>();\r
 470                         ArrayList<Float> hotloops = new ArrayList<Float>();\r
 471 \r
 472                         String sequenceName = scansingle.nextLine().trim();\r
 473                         TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,\r
 474                                         scansingle.nextLine());\r
 475                         TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,\r
 476                                         scansingle.nextLine());\r
 477                         TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,\r
 478                                         scansingle.nextLine());\r
 479 \r
 480                         String title = scansingle.nextLine();\r
 481                         assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";\r
 482 \r
 483                         while (scansingle.hasNext()) {\r
 484                                 seqbuffer.append(scansingle.next());\r
 485                                 coils.add(scansingle.nextFloat());\r
 486                                 rem.add(scansingle.nextFloat());\r
 487                                 hotloops.add(scansingle.nextFloat());\r
 488                         }\r
 489                         /*\r
 490                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
 491                          * seqbuffer.toString());\r
 492                          */\r
 493                         HashSet<Score> scores = new HashSet<Score>();\r
 494                         scores.add(new Score(DisemblResult.COILS, coils, coilsR));\r
 495                         scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, rem465R));\r
 496                         scores.add(new Score(DisemblResult.REM465, rem, loopsR));\r
 497                         results.put(sequenceName, scores);\r
 498 \r
 499                         scansingle.close();\r
 500                 }\r
 501                 scan.close();\r
 502                 input.close();\r
 503                 return results;\r
 504         }\r
 505 \r
 506         /**\r
 507          * Parsing:\r
 508          * \r
 509          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,\r
 510          * 350-391, 429-485, 497-506, 539-547\r
 511          * \r
 512          * # REM465 355-368\r
 513          * \r
 514          * # HOTLOOPS 190-204\r
 515          * \r
 516          * @param lines\r
 517          * @return\r
 518          */\r
 519         private static TreeSet<Range> parseRanges(Enum resultType, String lines) {\r
 520                 TreeSet<Range> ranges = new TreeSet<Range>();\r
 521 \r
 522                 Scanner scan = new Scanner(lines);\r
 523 \r
 524                 assert scan.hasNext();\r
 525                 String del = scan.next();\r
 526                 assert "#".equals(del); // pass delimiter #\r
 527                 String type = scan.next(); // pass enum name e.g. COILS\r
 528                 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "\r
 529                                 + resultType.toString();\r
 530 \r
 531                 // beginning of the ranges\r
 532                 scan.useDelimiter(",");\r
 533                 while (scan.hasNext()) {\r
 534                         String range = scan.next();\r
 535                         if (!Util.isEmpty(range)) {\r
 536                                 ranges.add(new Range(range.split("-")));\r
 537                         }\r
 538                 }\r
 539                 return ranges;\r
 540         }\r
 541 \r
 542         /**\r
 543          * \r
 544          > Foobar_dundeefriends\r
 545          * \r
 546          * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343\r
 547          * \r
 548          * # REM465 355-368\r
 549          * \r
 550          * # HOTLOOPS 190-204\r
 551          * \r
 552          * # RESIDUE COILS REM465 HOTLOOPS\r
 553          * \r
 554          * M 0.86010 0.88512 0.37094\r
 555          * \r
 556          * T 0.79983 0.85864 0.44331\r
 557          * \r
 558          * >Next Sequence name\r
 559          * \r
 560          * \r
 561          * @param input\r
 562          * @return\r
 563          * @throws IOException\r
 564          * @throws UnknownFileFormatException\r
 565          */\r
 566         public static HashMap<String, Set<Score>> readGlobPlot(\r
 567                         final InputStream input) throws IOException,\r
 568                         UnknownFileFormatException {\r
 569                 Scanner scan = new Scanner(input);\r
 570                 scan.useDelimiter(">");\r
 571                 if (!scan.hasNext()) {\r
 572                         throw new UnknownFileFormatException(\r
 573                                         "In GlobPlot score format each sequence score is expected "\r
 574                                                         + "to start from the line: >Sequence name "\r
 575                                                         + " No such line was found!");\r
 576                 }\r
 577 \r
 578                 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();\r
 579                 int seqCounter = 0;\r
 580                 while (scan.hasNext()) {\r
 581                         seqCounter++;\r
 582                         String singleSeq = scan.next();\r
 583                         Scanner scansingle = new Scanner(singleSeq);\r
 584                         if (!scansingle.hasNextLine()) {\r
 585                                 throw new RuntimeException(\r
 586                                                 "The input looks like an incomplete GlobPlot file - cannot parse!");\r
 587                         }\r
 588 \r
 589                         StringBuffer seqbuffer = new StringBuffer();\r
 590                         ArrayList<Float> dydxScore = new ArrayList<Float>();\r
 591                         ArrayList<Float> rawScore = new ArrayList<Float>();\r
 592                         ArrayList<Float> smoothedScore = new ArrayList<Float>();\r
 593 \r
 594                         String sequenceName = scansingle.nextLine().trim();\r
 595                         TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,\r
 596                                         scansingle.nextLine());\r
 597                         TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,\r
 598                                         scansingle.nextLine());\r
 599 \r
 600                         String title = scansingle.nextLine();\r
 601                         assert title.startsWith("# RESIDUE      DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";\r
 602 \r
 603                         while (scansingle.hasNext()) {\r
 604                                 seqbuffer.append(scansingle.next());\r
 605                                 dydxScore.add(scansingle.nextFloat());\r
 606                                 rawScore.add(scansingle.nextFloat());\r
 607                                 smoothedScore.add(scansingle.nextFloat());\r
 608                         }\r
 609                         /*\r
 610                          * Also possible FastaSequence fs = new FastaSequence(sequenceName,\r
 611                          * seqbuffer.toString());\r
 612                          */\r
 613                         HashSet<Score> scores = new HashSet<Score>();\r
 614                         scores.add(new Score(GlobProtResult.Disorder, disorderR));\r
 615                         scores.add(new Score(GlobProtResult.GlobDoms, domsR));\r
 616                         scores.add(new Score(GlobProtResult.Dydx, dydxScore));\r
 617                         scores.add(new Score(GlobProtResult.RawScore, rawScore));\r
 618                         scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));\r
 619                         results.put(sequenceName, scores);\r
 620 \r
 621                         scansingle.close();\r
 622                 }\r
 623                 scan.close();\r
 624                 input.close();\r
 625                 return results;\r
 626         }\r
 627         /**\r
 628          * Read AACon result with no alignment files. This method leaves incoming\r
 629          * InputStream open!\r
 630          * \r
 631          * @param results\r
 632          *            output file of AAConservation\r
 633          * @return Map with keys {@link ConservationMethod} -> float[]\r
 634          */\r
 635         public static HashSet<Score> readAAConResults(InputStream results) {\r
 636                 if (results == null) {\r
 637                         throw new NullPointerException(\r
 638                                         "InputStream with results must be provided");\r
 639                 }\r
 640                 HashSet<Score> annotations = new HashSet<Score>();\r
 641                 Scanner sc = new Scanner(results);\r
 642                 sc.useDelimiter("#");\r
 643                 while (sc.hasNext()) {\r
 644                         String line = sc.next();\r
 645                         int spacePos = line.indexOf(" ");\r
 646                         assert spacePos > 0 : "Space is expected as delimited between method "\r
 647                                         + "name and values!";\r
 648                         String methodLine = line.substring(0, spacePos);\r
 649                         ConservationMethod method = ConservationMethod\r
 650                                         .getMethod(methodLine);\r
 651                         assert method != null : "Method " + methodLine\r
 652                                         + " is not recognized! ";\r
 653                         Scanner valuesScanner = new Scanner(line.substring(spacePos));\r
 654                         ArrayList<Float> values = new ArrayList<Float>();\r
 655                         while (valuesScanner.hasNextDouble()) {\r
 656                                 Double value = valuesScanner.nextDouble();\r
 657                                 values.add(value.floatValue());\r
 658                         }\r
 659                         annotations.add(new Score(method, values));\r
 660                 }\r
 661                 return annotations;\r
 662         }\r
 663 \r
 664         /**\r
 665          * Reads and parses Fasta or Clustal formatted file into a list of\r
 666          * FastaSequence objects\r
 667          * \r
 668          * @param inFilePath\r
 669          *            the path to the input file\r
 670          * @throws IOException\r
 671          *             if the file denoted by inFilePath cannot be read\r
 672          * @throws UnknownFileFormatException\r
 673          *             if the inFilePath points to the file which format cannot be\r
 674          *             recognised\r
 675          * @return the List of FastaSequence objects\r
 676          * \r
 677          */\r
 678         public static List<FastaSequence> openInputStream(String inFilePath)\r
 679                         throws IOException, UnknownFileFormatException {\r
 680 \r
 681                 // This stream gets closed in isValidClustalFile method\r
 682                 InputStream inStrForValidation = new FileInputStream(inFilePath);\r
 683                 // This stream is closed in the calling methods\r
 684                 InputStream inStr = new FileInputStream(inFilePath);\r
 685                 List<FastaSequence> fastaSeqs = null;\r
 686                 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {\r
 687                         Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);\r
 688                         // alignment cannot be null see\r
 689                         // ClustalAlignmentUtil.readClustalFile(inStr);\r
 690                         fastaSeqs = al.getSequences();\r
 691                 } else {\r
 692                         fastaSeqs = SequenceUtil.readFasta(inStr);\r
 693                 }\r
 694                 return fastaSeqs;\r
 695         }\r
 696 \r
 697 }\r
 698 \r
 699 enum DisemblResult {\r
 700         /** These contains ranges and scores */\r
 701         COILS, REM465, HOTLOOPS\r
 702 }\r
 703 enum GlobProtResult {\r
 704         /** This a range with no scores */\r
 705         GlobDoms,\r
 706         /** This a range with no scores */\r
 707         Disorder,\r
 708         /** This a score with no range */\r
 709         Dydx,\r
 710         /** This a score with no range */\r
 711         SmoothedScore,\r
 712         /** This a score with no range */\r
 713         RawScore\r
 714 }