/* Copyright (c) 2011 Peter Troshin
 * Copyright (c) 2013 Alexander Sherstnev
 *
 * JAva Bioinformatics Analysis Web Services (JABAWS)
 *
 * This library is free software; you can redistribute it and/or modify it under the terms of the
 * Apache License version 2 as published by the Apache Software Foundation
 *
 * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache
 * License for more details.
 *
 * A copy of the license is in apache_license.txt. It is also available here:
 * @see: http://www.apache.org/licenses/LICENSE-2.0.txt
 *
 * Any republication or derived work distributed in source code form
 * must include this copyright and license notice.
 */
\r
21 package compbio.data.sequence;
\r
23 import java.io.BufferedReader;
\r
24 import java.io.BufferedWriter;
\r
25 import java.io.Closeable;
\r
26 import java.io.File;
\r
27 import java.io.FileInputStream;
\r
28 import java.io.FileNotFoundException;
\r
29 import java.io.IOException;
\r
30 import java.io.InputStream;
\r
31 import java.io.InputStreamReader;
\r
32 import java.io.OutputStream;
\r
33 import java.io.OutputStreamWriter;
\r
34 import java.util.ArrayList;
\r
35 import java.util.Arrays;
\r
36 import java.util.HashMap;
\r
37 import java.util.Collections;
\r
38 import java.util.HashSet;
\r
39 import java.util.List;
\r
40 import java.util.Map;
\r
41 import java.util.Scanner;
\r
42 import java.util.Set;
\r
43 import java.util.TreeMap;
\r
44 import java.util.TreeSet;
\r
45 import java.util.logging.Level;
\r
46 import java.util.regex.Matcher;
\r
47 import java.util.regex.Pattern;
\r
49 import compbio.util.Util;
\r
/**
 * Utility class for operations on sequences
 *
 * @author Peter Troshin
 * @version 2.0 June 2011
 */
\r
58 public final class SequenceUtil {
\r
61 * A whitespace character: [\t\n\x0B\f\r]
\r
63 public static final Pattern WHITE_SPACE = Pattern.compile("\\s");
\r
68 public static final Pattern DIGIT = Pattern.compile("\\d");
\r
73 public static final Pattern NONWORD = Pattern.compile("\\W");
\r
78 public static final Pattern AA = Pattern.compile("[ARNDCQEGHILKMFPSTWYV]+",
\r
79 Pattern.CASE_INSENSITIVE);
\r
82 * inversion of AA pattern
\r
84 public static final Pattern NON_AA = Pattern.compile(
\r
85 "[^ARNDCQEGHILKMFPSTWYV]+", Pattern.CASE_INSENSITIVE);
\r
88 * Same as AA pattern but with two additional letters - XU
\r
90 public static final Pattern AMBIGUOUS_AA = Pattern.compile(
\r
91 "[ARNDCQEGHILKMFPSTWYVXU]+", Pattern.CASE_INSENSITIVE);
\r
94 * Nucleotides a, t, g, c, u
\r
96 public static final Pattern NUCLEOTIDE = Pattern.compile("[AGTCU]+",
\r
97 Pattern.CASE_INSENSITIVE);
\r
100 * Ambiguous nucleotide
\r
102 public static final Pattern AMBIGUOUS_NUCLEOTIDE = Pattern.compile(
\r
103 "[AGTCRYMKSWHBVDNU]+", Pattern.CASE_INSENSITIVE); // see IUPAC
\r
107 public static final Pattern NON_NUCLEOTIDE = Pattern.compile("[^AGTCU]+",
\r
108 Pattern.CASE_INSENSITIVE);
\r
110 private SequenceUtil() {
\r
111 } // utility class, no instantiation
\r
114 * @return true is the sequence contains only letters a,c, t, g, u
\r
116 public static boolean isNucleotideSequence(final FastaSequence s) {
\r
117 return SequenceUtil.isNonAmbNucleotideSequence(s.getSequence());
\r
121 * Ambiguous DNA chars : AGTCRYMKSWHBVDN // differs from protein in only one
\r
124 public static boolean isNonAmbNucleotideSequence(String sequence) {
\r
125 sequence = SequenceUtil.cleanSequence(sequence);
\r
126 if (SequenceUtil.DIGIT.matcher(sequence).find()) {
\r
129 if (SequenceUtil.NON_NUCLEOTIDE.matcher(sequence).find()) {
\r
132 * System.out.format("I found the text starting at " +
\r
133 * "index %d and ending at index %d.%n", nonDNAmatcher .start(),
\r
134 * nonDNAmatcher.end());
\r
137 final Matcher DNAmatcher = SequenceUtil.NUCLEOTIDE.matcher(sequence);
\r
138 return DNAmatcher.find();
\r
142 * Removes all whitespace chars in the sequence string
\r
145 * @return cleaned up sequence
\r
147 public static String cleanSequence(String sequence) {
\r
148 assert sequence != null;
\r
149 final Matcher m = SequenceUtil.WHITE_SPACE.matcher(sequence);
\r
150 sequence = m.replaceAll("").toUpperCase();
\r
155 * Removes all special characters and digits as well as whitespace chars
\r
156 * from the sequence
\r
159 * @return cleaned up sequence
\r
161 public static String deepCleanSequence(String sequence) {
\r
162 sequence = SequenceUtil.cleanSequence(sequence);
\r
163 sequence = SequenceUtil.DIGIT.matcher(sequence).replaceAll("");
\r
164 sequence = SequenceUtil.NONWORD.matcher(sequence).replaceAll("");
\r
165 final Pattern othernonSeqChars = Pattern.compile("[_-]+");
\r
166 sequence = othernonSeqChars.matcher(sequence).replaceAll("");
\r
171 * Remove all non AA chars from the sequence
\r
174 * the sequence to clean
\r
175 * @return cleaned sequence
\r
177 public static String cleanProteinSequence(String sequence) {
\r
178 return SequenceUtil.NON_AA.matcher(sequence).replaceAll("");
\r
183 * @return true is the sequence is a protein sequence, false overwise
\r
185 public static boolean isProteinSequence(String sequence) {
\r
186 sequence = SequenceUtil.cleanSequence(sequence);
\r
187 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {
\r
190 if (SequenceUtil.DIGIT.matcher(sequence).find()) {
\r
193 if (SequenceUtil.NON_AA.matcher(sequence).find()) {
\r
196 final Matcher protmatcher = SequenceUtil.AA.matcher(sequence);
\r
197 return protmatcher.find();
\r
201 * Check whether the sequence confirms to amboguous protein sequence
\r
204 * @return return true only if the sequence if ambiguous protein sequence
\r
205 * Return false otherwise. e.g. if the sequence is non-ambiguous
\r
208 public static boolean isAmbiguosProtein(String sequence) {
\r
209 sequence = SequenceUtil.cleanSequence(sequence);
\r
210 if (SequenceUtil.isNonAmbNucleotideSequence(sequence)) {
\r
213 if (SequenceUtil.DIGIT.matcher(sequence).find()) {
\r
216 if (SequenceUtil.NON_AA.matcher(sequence).find()) {
\r
219 if (SequenceUtil.AA.matcher(sequence).find()) {
\r
222 final Matcher amb_prot = SequenceUtil.AMBIGUOUS_AA.matcher(sequence);
\r
223 return amb_prot.find();
\r
227 * Writes list of FastaSequeces into the outstream formatting the sequence
\r
228 * so that it contains width chars on each line
\r
233 * - the maximum number of characters to write in one line
\r
234 * @throws IOException
\r
236 public static void writeFasta(final OutputStream outstream,
\r
237 final List<FastaSequence> sequences, final int width)
\r
238 throws IOException {
\r
239 writeFastaKeepTheStream(outstream, sequences, width);
\r
243 public static void writeFastaKeepTheStream(final OutputStream outstream,
\r
244 final List<FastaSequence> sequences, final int width)
\r
245 throws IOException {
\r
246 final OutputStreamWriter writer = new OutputStreamWriter(outstream);
\r
247 final BufferedWriter fastawriter = new BufferedWriter(writer);
\r
248 for (final FastaSequence fs : sequences) {
\r
249 fastawriter.write(">" + fs.getId() + "\n");
\r
250 fastawriter.write(fs.getFormatedSequence(width));
\r
251 fastawriter.write("\n");
\r
253 fastawriter.flush();
\r
258 * Reads fasta sequences from inStream into the list of FastaSequence
\r
263 * @return list of FastaSequence objects
\r
264 * @throws IOException
\r
266 public static List<FastaSequence> readFasta(final InputStream inStream)
\r
267 throws IOException {
\r
268 final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
\r
269 FastaReader reader = new FastaReader(inStream);
\r
270 while (reader.hasNext()) {
\r
271 seqs.add(reader.next());
\r
278 * Writes FastaSequence in the file, each sequence will take one line only
\r
282 * @throws IOException
\r
284 public static void writeFasta(final OutputStream os,
\r
285 final List<FastaSequence> sequences) throws IOException {
\r
286 final OutputStreamWriter outWriter = new OutputStreamWriter(os);
\r
287 final BufferedWriter fasta_out = new BufferedWriter(outWriter);
\r
288 for (final FastaSequence fs : sequences) {
\r
289 fasta_out.write(fs.getOnelineFasta());
\r
296 public static final List<FastaSequence> readJpredFile(InputStream result)
\r
297 throws IOException, FileNotFoundException,NullPointerException {
\r
298 return readFasta (result);
\r
302 * Read IUPred output
\r
305 * @return Map key->sequence name, value->Score
\r
306 * @throws IOException
\r
307 * @throws UnknownFileFormatException
\r
309 public static Map<String, Score> readIUPred(final File result)
\r
310 throws IOException, UnknownFileFormatException {
\r
311 InputStream input = new FileInputStream(result);
\r
312 Map<String, Score> sequences = readIUPred(input,
\r
313 IUPredResult.getType(result));
\r
318 // Check the type of the file e.g. long| short or domain
\r
333 * @throws IOException
\r
334 * @throws UnknownFileFormatException
\r
338 private static Map<String, Score> readIUPred(InputStream input,
\r
339 IUPredResult type) throws IOException, UnknownFileFormatException {
\r
341 Score score = null;
\r
342 final Map<String, Score> seqs = new HashMap<String, Score>();
\r
343 Scanner scan = new Scanner(input);
\r
344 scan.useDelimiter("#");
\r
345 while (scan.hasNext()) {
\r
346 String nextEntry = scan.next();
\r
347 Scanner entry = new Scanner(nextEntry);
\r
348 String name = entry.nextLine().trim();
\r
350 if (IUPredResult.Glob == type) {
\r
352 TreeSet<Range> ranges = parseIUPredDomains(entry);
\r
353 score = new Score(type, ranges);
\r
355 // parse short | long
\r
356 float[] scores = parseIUPredScores(entry);
\r
357 score = new Score(type, scores);
\r
360 seqs.put(name, score);
\r
370 * Number of globular domains: 2
\r
372 * globular domain 1. 98 - 269
\r
374 * globular domain 2. 431 - 482
\r
378 * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp
\r
382 private static TreeSet<Range> parseIUPredDomains(Scanner scan) {
\r
383 String header = "Number of globular domains:";
\r
384 String domainPref = "globular domain";
\r
385 TreeSet<Range> ranges = new TreeSet<Range>();
\r
386 String line = scan.nextLine().trim();
\r
387 assert line.startsWith(header);
\r
388 line = line.substring(header.length()).trim();
\r
389 int domainNum = Integer.parseInt(line);
\r
390 if (domainNum == 0) {
\r
394 for (int i = 0; i < domainNum; i++) {
\r
395 assert scan.hasNextLine();
\r
396 line = scan.nextLine();
\r
397 assert line.trim().startsWith(domainPref);
\r
398 line = line.substring(line.indexOf(".") + 1).trim();
\r
399 Range r = new Range(line.split("-"));
\r
410 private static float[] parseIUPredScores(Scanner scan)
\r
411 throws UnknownFileFormatException {
\r
412 List<String> annotation = new ArrayList<String>();
\r
413 while (scan.hasNextLine()) {
\r
414 String line = scan.nextLine().trim();
\r
415 String[] val = line.split("\\s+");
\r
416 annotation.add(val[2]);
\r
418 return convertToNumber(annotation
\r
419 .toArray(new String[annotation.size()]));
\r
422 public static Map<String, Score> readJRonn(final File result)
\r
423 throws IOException, UnknownFileFormatException {
\r
424 InputStream input = new FileInputStream(result);
\r
425 Map<String, Score> sequences = readJRonn(input);
\r
431 * Reader for JRonn horizontal file format
\r
434 * >Foobar M G D T T A G 0.48 0.42
\r
435 * 0.42 0.48 0.52 0.53 0.54
\r
438 * Where all values are tab delimited
\r
441 * the InputStream connected to the JRonn output file
\r
442 * @return Map key=sequence name value=Score
\r
443 * @throws IOException
\r
444 * is thrown if the inStream has problems accessing the data
\r
445 * @throws UnknownFileFormatException
\r
446 * is thrown if the inStream represents an unknown source of
\r
447 * data, i.e. not a JRonn output
\r
449 public static Map<String, Score> readJRonn(final InputStream inStream)
\r
450 throws IOException, UnknownFileFormatException {
\r
451 final Map<String, Score> seqs = new HashMap<String, Score>();
\r
453 final BufferedReader infasta = new BufferedReader(
\r
454 new InputStreamReader(inStream, "UTF8"), 16000);
\r
459 line = infasta.readLine();
\r
460 if (line == null || line.isEmpty()) {
\r
461 // skip empty lines
\r
464 if (line.startsWith(">")) {
\r
466 sname = line.trim().substring(1);
\r
467 // read sequence line
\r
468 line = infasta.readLine();
\r
469 final String sequence = line.replace("\t", "");
\r
470 // read annotation line
\r
471 line = infasta.readLine();
\r
472 String[] annotValues = line.split("\t");
\r
473 float[] annotation = convertToNumber(annotValues);
\r
474 if (annotation.length != sequence.length()) {
\r
475 throw new UnknownFileFormatException(
\r
476 "File does not look like Jronn horizontally formatted output file!\n"
\r
477 + JRONN_WRONG_FORMAT_MESSAGE);
\r
479 seqs.put(sname, new Score(DisorderMethod.JRonn, annotation));
\r
481 } while (line != null);
\r
487 private static float[] convertToNumber(String[] annotValues)
\r
488 throws UnknownFileFormatException {
\r
489 float[] annotation = new float[annotValues.length];
\r
491 for (int i = 0; i < annotation.length; i++) {
\r
492 annotation[i] = Float.parseFloat(annotValues[i]);
\r
494 } catch (NumberFormatException e) {
\r
495 throw new UnknownFileFormatException(JRONN_WRONG_FORMAT_MESSAGE,
\r
501 private static final String JRONN_WRONG_FORMAT_MESSAGE = "Jronn file must be in the following format:\n"
\r
502 + ">sequence_name\n "
\r
504 + "0.43 0.22 0.65\n"
\r
505 + "Where first line is the sequence name,\n"
\r
506 + "second line is the tab delimited sequence,\n"
\r
507 + "third line contains tab delimited disorder prediction values.\n"
\r
508 + "No lines are allowed between these three. Additionally, the number of "
\r
509 + "sequence residues must be equal to the number of the disorder values.";
\r
512 * Closes the Closable and logs the exception if any
\r
517 public final static void closeSilently(java.util.logging.Logger log,
\r
518 Closeable stream) {
\r
519 if (stream != null) {
\r
522 } catch (IOException e) {
\r
523 log.log(Level.WARNING, e.getLocalizedMessage(), e.getCause());
\r
530 > Foobar_dundeefriends
\r
532 * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343
\r
536 * # HOTLOOPS 190-204
\r
538 * # RESIDUE COILS REM465 HOTLOOPS
\r
540 * M 0.86010 0.88512 0.37094
\r
542 * T 0.79983 0.85864 0.44331
\r
544 * >Next Sequence name
\r
549 * @return Map key=sequence name, value=set of score
\r
550 * @throws IOException
\r
551 * @throws UnknownFileFormatException
\r
553 public static HashMap<String, Set<Score>> readDisembl(
\r
554 final InputStream input) throws IOException,
\r
555 UnknownFileFormatException {
\r
556 Scanner scan = new Scanner(input);
\r
557 scan.useDelimiter(">");
\r
558 if (!scan.hasNext()) {
\r
559 throw new UnknownFileFormatException(
\r
560 "In Disembl score format each sequence score is expected "
\r
561 + "to start from the line: >Sequence name "
\r
562 + " No such line was found!");
\r
565 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();
\r
566 int seqCounter = 0;
\r
567 while (scan.hasNext()) {
\r
569 String singleSeq = scan.next();
\r
570 Scanner scansingle = new Scanner(singleSeq);
\r
571 if (!scansingle.hasNextLine()) {
\r
572 throw new RuntimeException(
\r
573 "The input looks like an incomplete disembl file - cannot parse!");
\r
576 StringBuffer seqbuffer = new StringBuffer();
\r
577 ArrayList<Float> coils = new ArrayList<Float>();
\r
578 ArrayList<Float> rem = new ArrayList<Float>();
\r
579 ArrayList<Float> hotloops = new ArrayList<Float>();
\r
581 String sequenceName = scansingle.nextLine().trim();
\r
582 TreeSet<Range> coilsR = parseRanges(DisemblResult.COILS,
\r
583 scansingle.nextLine());
\r
584 TreeSet<Range> rem465R = parseRanges(DisemblResult.REM465,
\r
585 scansingle.nextLine());
\r
586 TreeSet<Range> loopsR = parseRanges(DisemblResult.HOTLOOPS,
\r
587 scansingle.nextLine());
\r
589 String title = scansingle.nextLine();
\r
590 assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!";
\r
592 while (scansingle.hasNext()) {
\r
593 seqbuffer.append(scansingle.next());
\r
594 coils.add(scansingle.nextFloat());
\r
595 rem.add(scansingle.nextFloat());
\r
596 hotloops.add(scansingle.nextFloat());
\r
599 * Also possible FastaSequence fs = new FastaSequence(sequenceName,
\r
600 * seqbuffer.toString());
\r
602 HashSet<Score> scores = new HashSet<Score>();
\r
603 scores.add(new Score(DisemblResult.COILS, coils, coilsR));
\r
604 scores.add(new Score(DisemblResult.REM465, rem, rem465R));
\r
605 scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, loopsR));
\r
606 results.put(sequenceName, scores);
\r
608 scansingle.close();
\r
618 * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343,
\r
619 * 350-391, 429-485, 497-506, 539-547
\r
623 * # HOTLOOPS 190-204
\r
628 private static TreeSet<Range> parseRanges(Enum resultType, String lines) {
\r
629 TreeSet<Range> ranges = new TreeSet<Range>();
\r
631 Scanner scan = new Scanner(lines);
\r
633 assert scan.hasNext();
\r
634 String del = scan.next();
\r
635 assert "#".equals(del); // pass delimiter #
\r
636 String type = scan.next(); // pass enum name e.g. COILS
\r
637 assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: "
\r
638 + resultType.toString();
\r
640 // beginning of the ranges
\r
641 scan.useDelimiter(",");
\r
642 while (scan.hasNext()) {
\r
643 String range = scan.next();
\r
644 if (!Util.isEmpty(range)) {
\r
645 ranges.add(new Range(range.split("-")));
\r
653 > Foobar_dundeefriends
\r
655 * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343
\r
659 * # HOTLOOPS 190-204
\r
661 * # RESIDUE COILS REM465 HOTLOOPS
\r
663 * M 0.86010 0.88512 0.37094
\r
665 * T 0.79983 0.85864 0.44331
\r
667 * >Next Sequence name
\r
671 * @return Map key=sequence name, value=set of score
\r
672 * @throws IOException
\r
673 * @throws UnknownFileFormatException
\r
675 public static HashMap<String, Set<Score>> readGlobPlot(
\r
676 final InputStream input) throws IOException,
\r
677 UnknownFileFormatException {
\r
678 Scanner scan = new Scanner(input);
\r
679 scan.useDelimiter(">");
\r
680 if (!scan.hasNext()) {
\r
681 throw new UnknownFileFormatException(
\r
682 "In GlobPlot score format each sequence score is expected "
\r
683 + "to start from the line: >Sequence name "
\r
684 + " No such line was found!");
\r
687 HashMap<String, Set<Score>> results = new HashMap<String, Set<Score>>();
\r
688 int seqCounter = 0;
\r
689 while (scan.hasNext()) {
\r
691 String singleSeq = scan.next();
\r
692 Scanner scansingle = new Scanner(singleSeq);
\r
693 if (!scansingle.hasNextLine()) {
\r
694 throw new RuntimeException(
\r
695 "The input looks like an incomplete GlobPlot file - cannot parse!");
\r
698 StringBuffer seqbuffer = new StringBuffer();
\r
699 ArrayList<Float> dydxScore = new ArrayList<Float>();
\r
700 ArrayList<Float> rawScore = new ArrayList<Float>();
\r
701 ArrayList<Float> smoothedScore = new ArrayList<Float>();
\r
703 String sequenceName = scansingle.nextLine().trim();
\r
704 TreeSet<Range> domsR = parseRanges(GlobProtResult.GlobDoms,
\r
705 scansingle.nextLine());
\r
706 TreeSet<Range> disorderR = parseRanges(GlobProtResult.Disorder,
\r
707 scansingle.nextLine());
\r
709 String title = scansingle.nextLine();
\r
710 assert title.startsWith("# RESIDUE DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!";
\r
712 while (scansingle.hasNext()) {
\r
713 seqbuffer.append(scansingle.next());
\r
714 dydxScore.add(scansingle.nextFloat());
\r
715 rawScore.add(scansingle.nextFloat());
\r
716 smoothedScore.add(scansingle.nextFloat());
\r
719 * Also possible FastaSequence fs = new FastaSequence(sequenceName,
\r
720 * seqbuffer.toString());
\r
722 Set<Score> scores = new TreeSet<Score>();
\r
723 scores.add(new Score(GlobProtResult.Disorder, disorderR));
\r
724 scores.add(new Score(GlobProtResult.GlobDoms, domsR));
\r
725 scores.add(new Score(GlobProtResult.Dydx, dydxScore));
\r
726 scores.add(new Score(GlobProtResult.RawScore, rawScore));
\r
727 scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore));
\r
728 results.put(sequenceName, scores);
\r
730 scansingle.close();
\r
737 * Read AACon result with no alignment files. This method leaves incoming
\r
738 * InputStream open!
\r
741 * output file of AAConservation
\r
742 * @return Map with keys {@link ConservationMethod} -> float[]
\r
744 public static HashSet<Score> readAAConResults(InputStream results) {
\r
745 if (results == null) {
\r
746 throw new NullPointerException(
\r
747 "InputStream with results must be provided");
\r
749 HashSet<Score> annotations = new HashSet<Score>();
\r
750 Scanner sc = new Scanner(results);
\r
751 sc.useDelimiter("#");
\r
752 while (sc.hasNext()) {
\r
753 String line = sc.next();
\r
754 int spacePos = line.indexOf(" ");
\r
755 assert spacePos > 0 : "Space is expected as delimited between method "
\r
756 + "name and values!";
\r
757 String methodLine = line.substring(0, spacePos);
\r
758 ConservationMethod method = ConservationMethod
\r
759 .getMethod(methodLine);
\r
760 assert method != null : "Method " + methodLine
\r
761 + " is not recognized! ";
\r
762 Scanner valuesScanner = new Scanner(line.substring(spacePos));
\r
763 ArrayList<Float> values = new ArrayList<Float>();
\r
764 while (valuesScanner.hasNextDouble()) {
\r
765 Double value = valuesScanner.nextDouble();
\r
766 values.add(value.floatValue());
\r
768 annotations.add(new Score(method, values));
\r
770 return annotations;
\r
777 * Reads and parses Fasta or Clustal formatted file into a list of
\r
778 * FastaSequence objects
\r
780 * @param inFilePath
\r
781 * the path to the input file
\r
782 * @throws IOException
\r
783 * if the file denoted by inFilePath cannot be read
\r
784 * @throws UnknownFileFormatException
\r
785 * if the inFilePath points to the file which format cannot be
\r
787 * @return the List of FastaSequence objects
\r
790 public static List<FastaSequence> openInputStream(String inFilePath)
\r
791 throws IOException, UnknownFileFormatException {
\r
793 // This stream gets closed in isValidClustalFile method
\r
794 InputStream inStrForValidation = new FileInputStream(inFilePath);
\r
795 // This stream is closed in the calling methods
\r
796 InputStream inStr = new FileInputStream(inFilePath);
\r
797 List<FastaSequence> fastaSeqs = null;
\r
798 if (ClustalAlignmentUtil.isValidClustalFile(inStrForValidation)) {
\r
799 Alignment al = ClustalAlignmentUtil.readClustalFile(inStr);
\r
800 // alignment cannot be null see
\r
801 // ClustalAlignmentUtil.readClustalFile(inStr);
\r
802 fastaSeqs = al.getSequences();
\r
804 fastaSeqs = SequenceUtil.readFasta(inStr);
\r
809 // This can't possibly be right for all cases!
\r
810 // but it will do for now
\r
812 // As for the metadata. This function doesnt know what program
\r
813 // generated it. How to handle the metadata!?
\r
815 public static void writeClustal(OutputStream outStream,
\r
816 List<FastaSequence> sequences, char gapChar)
\r
817 throws IOException {
\r
819 BufferedWriter writer = new BufferedWriter(
\r
820 new OutputStreamWriter(outStream));
\r
821 // will give AlignmentMetadata default type of CLUSTAL for now
\r
822 AlignmentMetadata al = new AlignmentMetadata(Program.CLUSTAL, gapChar);
\r
824 ClustalAlignmentUtil.writeClustalAlignment(writer,
\r
825 new Alignment(sequences, al));
\r
/**
 * The kinds of result found in DisEMBL output.
 */
enum DisemblResult {
    /** These contain ranges and scores */
    COILS, REM465, HOTLOOPS
}
\r
/**
 * The kinds of result found in GlobPlot output.
 * <p>
 * NOTE(review): the constant names are the ones used by
 * SequenceUtil.readGlobPlot; their declaration order could not be verified
 * - confirm before relying on ordinal() or natural ordering.
 */
enum GlobProtResult {
    /** This is a range with no scores */
    GlobDoms,
    /** This is a range with no scores */
    Disorder,
    /** This is a score with no range */
    Dydx,
    /** This is a score with no range */
    SmoothedScore,
    /** This is a score with no range */
    RawScore
}
\r
/**
 * The kinds of result IUPred produces. The kind is encoded in the name of
 * the result file, see {@link #getType(File)}.
 */
enum IUPredResult {
    /** Short disorder */
    Short,
    /** Long disorder */
    Long,
    /** Globular domains */
    Glob;

    /**
     * Derives the result type from the end of the file name, which must be
     * the lower-cased name of one of the constants: "short", "long" or
     * "glob". The comparison is case sensitive.
     *
     * @param file
     *            the IUPred result file, must not be null
     * @return the type the file name ends with
     * @throws AssertionError
     *             if the file name ends with none of the known suffixes
     */
    static IUPredResult getType(File file) {
        assert file != null;
        final String name = file.getName();
        // the suffixes cannot overlap, so the order of the checks is
        // irrelevant
        for (final IUPredResult type : values()) {
            if (name.endsWith(type.toString().toLowerCase(
                    java.util.Locale.ROOT))) {
                return type;
            }
        }
        throw new AssertionError(
                "IUPred result file type cannot be recognised! "
                        + "\nFile must ends with one of [glob, long or short]"
                        + "\n but given file name was: " + file.getName());
    }
}
\r