X-Git-Url: http://source.jalview.org/gitweb/?a=blobdiff_plain;f=datamodel%2Fcompbio%2Fdata%2Fsequence%2FSequenceUtil.java;h=16b65f241c43b2b7a31fcd55abfe76ee977609c1;hb=96ab3a201572c3ed7f9a27682bd0b6ca7edb34c7;hp=f061ac79537345e462ebe299ab0f328273ec2f40;hpb=e98ca8829b7c42c00f2f2588e1168d3f31eb248f;p=jabaws.git diff --git a/datamodel/compbio/data/sequence/SequenceUtil.java b/datamodel/compbio/data/sequence/SequenceUtil.java index f061ac7..16b65f2 100644 --- a/datamodel/compbio/data/sequence/SequenceUtil.java +++ b/datamodel/compbio/data/sequence/SequenceUtil.java @@ -1,15 +1,19 @@ -/* - * @(#)SequenceUtil.java 1.0 September 2009 Copyright (c) 2009 Peter Troshin - * Jalview Web Services version: 2.0 This library is free software; you can - * redistribute it and/or modify it under the terms of the Apache License - * version 2 as published by the Apache Software Foundation This library is - * distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; - * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A - * PARTICULAR PURPOSE. See the Apache License for more details. A copy of the - * license is in apache_license.txt. It is also available here: see: - * http://www.apache.org/licenses/LICENSE-2.0.txt Any republication or derived - * work distributed in source code form must include this copyright and license - * notice. +/* Copyright (c) 2011 Peter Troshin + * + * JAva Bioinformatics Analysis Web Services (JABAWS) @version: 2.0 + * + * This library is free software; you can redistribute it and/or modify it under the terms of the + * Apache License version 2 as published by the Apache Software Foundation + * + * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without + * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Apache + * License for more details. + * + * A copy of the license is in apache_license.txt. 
It is also available here: + * @see: http://www.apache.org/licenses/LICENSE-2.0.txt + * + * Any republication or derived work distributed in source code form + * must include this copyright and license notice. */ package compbio.data.sequence; @@ -25,18 +29,25 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Scanner; +import java.util.Set; +import java.util.TreeSet; import java.util.logging.Level; import java.util.regex.Matcher; import java.util.regex.Pattern; +import compbio.util.Util; + /** * Utility class for operations on sequences * - * @author Petr Troshin - * @version 1.0 + * @author Peter Troshin + * @since 1.0 + * @version 2.0 June 2011 */ public final class SequenceUtil { @@ -93,19 +104,6 @@ public final class SequenceUtil { private SequenceUtil() { } // utility class, no instantiation - /* - * public static void write_PirSeq(OutputStream os, FastaSequence seq) - * throws IOException { BufferedWriter pir_out = new BufferedWriter(new - * OutputStreamWriter(os)); pir_out.write(">P1;" + seq.getId() + - * SysPrefs.newlinechar); pir_out.write(seq.getSequence() + - * SysPrefs.newlinechar); pir_out.close(); } public static void - * write_FastaSeq(OutputStream os, FastaSequence seq) throws IOException { - * BufferedWriter fasta_out = new BufferedWriter( new - * OutputStreamWriter(os)); fasta_out.write(">" + seq.getId() + - * SysPrefs.newlinechar); fasta_out.write(seq.getSequence() + - * SysPrefs.newlinechar); fasta_out.close(); } - */ - /** * @return true is the sequence contains only letters a,c, t, g, u */ @@ -164,6 +162,17 @@ public final class SequenceUtil { } /** + * Remove all non AA chars from the sequence + * + * @param sequence + * the sequence to clean + * @return cleaned sequence + */ + public static String cleanProteinSequence(String sequence) { + return 
SequenceUtil.NON_AA.matcher(sequence).replaceAll(""); + } + + /** * @param sequence * @return true is the sequence is a protein sequence, false overwise */ @@ -251,28 +260,11 @@ public final class SequenceUtil { public static List readFasta(final InputStream inStream) throws IOException { final List seqs = new ArrayList(); - - final BufferedReader infasta = new BufferedReader( - new InputStreamReader(inStream, "UTF8"), 16000); - final Pattern pattern = Pattern.compile("//s+"); - - String line; - String sname = "", seqstr = null; - do { - line = infasta.readLine(); - if ((line == null) || line.startsWith(">")) { - if (seqstr != null) { - seqs.add(new FastaSequence(sname.substring(1), seqstr)); - } - sname = line; // remove > - seqstr = ""; - } else { - final String subseq = pattern.matcher(line).replaceAll(""); - seqstr += subseq; - } - } while (line != null); - - infasta.close(); + FastaReader reader = new FastaReader(inStream); + while (reader.hasNext()) { + seqs.add(reader.next()); + } + inStream.close(); return seqs; } @@ -294,10 +286,131 @@ public final class SequenceUtil { outWriter.close(); } - public static List readJRonn(final File result) + /** + * Read IUPred output + * + * @param result + * @return + * @throws IOException + * @throws UnknownFileFormatException + */ + public static Map readIUPred(final File result) throws IOException, UnknownFileFormatException { InputStream input = new FileInputStream(result); - List sequences = readJRonn(input); + Map sequences = readIUPred(input, + IUPredResult.getType(result)); + input.close(); + return sequences; + } + + // Check the type of the file e.g. 
long| short or domain + // and read + /** + * ## Long Disorder + * + * # P53_HUMAN + * + * 1 M 0.9943 + * + * 2 E 0.9917 + * + * 3 E 0.9879 + * + * (every line) + * + * @throws IOException + * @throws UnknownFileFormatException + * + * + */ + private static Map readIUPred(InputStream input, + IUPredResult type) throws IOException, UnknownFileFormatException { + + Score score = null; + final Map seqs = new HashMap(); + Scanner scan = new Scanner(input); + scan.useDelimiter("#"); + while (scan.hasNext()) { + String nextEntry = scan.next(); + Scanner entry = new Scanner(nextEntry); + String name = entry.nextLine().trim(); + // inside entry: + if (IUPredResult.Glob == type) { + // parse domains + TreeSet ranges = parseIUPredDomains(entry); + score = new Score(type, ranges); + } else { + // parse short | long + float[] scores = parseIUPredScores(entry); + score = new Score(type, scores); + } + entry.close(); + seqs.put(name, score); + } + + scan.close(); + return seqs; + } + + /** + * # P53_HUMA + * + * Number of globular domains: 2 + * + * globular domain 1. 98 - 269 + * + * globular domain 2. 
431 - 482 + * + * >P53_HUMA + * + * meepqsdpsv epplsqetfs dlwkllpenn vlsplpsqam ddlmlspddi eqwftedpgp + * + * @param scan + */ + private static TreeSet parseIUPredDomains(Scanner scan) { + String header = "Number of globular domains:"; + String domainPref = "globular domain"; + TreeSet ranges = new TreeSet(); + String line = scan.nextLine().trim(); + assert line.startsWith(header); + line = line.substring(header.length()).trim(); + int domainNum = Integer.parseInt(line); + if (domainNum == 0) { + return ranges; + } + + for (int i = 0; i < domainNum; i++) { + assert scan.hasNextLine(); + line = scan.nextLine(); + assert line.trim().startsWith(domainPref); + line = line.substring(line.indexOf(".") + 1).trim(); + Range r = new Range(line.split("-")); + ranges.add(r); + } + + return ranges; + } + /* + * 1 M 0.9943 + * + * 2 E 0.9917 + */ + private static float[] parseIUPredScores(Scanner scan) + throws UnknownFileFormatException { + List annotation = new ArrayList(); + while (scan.hasNextLine()) { + String line = scan.nextLine().trim(); + String[] val = line.split("\\s+"); + annotation.add(val[2]); + } + return convertToNumber(annotation + .toArray(new String[annotation.size()])); + } + + public static Map readJRonn(final File result) + throws IOException, UnknownFileFormatException { + InputStream input = new FileInputStream(result); + Map sequences = readJRonn(input); input.close(); return sequences; } @@ -321,9 +434,9 @@ public final class SequenceUtil { * is thrown if the inStream represents an unknown source of * data, i.e. 
not a JRonn output */ - public static List readJRonn(final InputStream inStream) + public static Map readJRonn(final InputStream inStream) throws IOException, UnknownFileFormatException { - final List seqs = new ArrayList(); + final Map seqs = new HashMap(); final BufferedReader infasta = new BufferedReader( new InputStreamReader(inStream, "UTF8"), 16000); @@ -351,7 +464,7 @@ public final class SequenceUtil { "File does not look like Jronn horizontally formatted output file!\n" + JRONN_WRONG_FORMAT_MESSAGE); } - seqs.add(new AnnotatedSequence(sname, sequence, annotation)); + seqs.put(sname, new Score(DisorderMethod.JRonn, annotation)); } } while (line != null); @@ -402,68 +515,214 @@ public final class SequenceUtil { /** * - * TODO complete! + > Foobar_dundeefriends + * + * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343 + * + * # REM465 355-368 + * + * # HOTLOOPS 190-204 + * + * # RESIDUE COILS REM465 HOTLOOPS + * + * M 0.86010 0.88512 0.37094 + * + * T 0.79983 0.85864 0.44331 + * + * >Next Sequence name * - * # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512 0.37094 T 0.79983 - * 0.85864 0.44331 .... # RESIDUE COILS REM465 HOTLOOPS M 0.86010 0.88512 - * 0.37094 * * @param input * @return * @throws IOException * @throws UnknownFileFormatException */ - static List> readDisembl( + public static HashMap> readDisembl( final InputStream input) throws IOException, UnknownFileFormatException { Scanner scan = new Scanner(input); - scan.useDelimiter("# RESIDUE COILS REM465 HOTLOOPS\n"); + scan.useDelimiter(">"); if (!scan.hasNext()) { throw new UnknownFileFormatException( - "In Disembl score format each seqeunce score is expected to start from the line: " - + "'# RESIDUE COILS REM465 HOTLOOPS\\n'." 
+ "In Disembl score format each sequence score is expected " + + "to start from the line: >Sequence name " + " No such line was found!"); } - List> results = new ArrayList>(); + HashMap> results = new HashMap>(); int seqCounter = 0; while (scan.hasNext()) { seqCounter++; String singleSeq = scan.next(); Scanner scansingle = new Scanner(singleSeq); + if (!scansingle.hasNextLine()) { + throw new RuntimeException( + "The input looks like an incomplete disembl file - cannot parse!"); + } + StringBuffer seqbuffer = new StringBuffer(); ArrayList coils = new ArrayList(); ArrayList rem = new ArrayList(); ArrayList hotloops = new ArrayList(); - MultiAnnotatedSequence disemblRes = new MultiAnnotatedSequence( - DisemblResultAnnot.class); - - while (scansingle.hasNextLine()) { - String valueLine = scansingle.nextLine(); - Scanner values = new Scanner(valueLine); - seqbuffer.append(values.next()); - coils.add(values.nextFloat()); - rem.add(values.nextFloat()); - hotloops.add(values.nextFloat()); - values.close(); + String sequenceName = scansingle.nextLine().trim(); + TreeSet coilsR = parseRanges(DisemblResult.COILS, + scansingle.nextLine()); + TreeSet rem465R = parseRanges(DisemblResult.REM465, + scansingle.nextLine()); + TreeSet loopsR = parseRanges(DisemblResult.HOTLOOPS, + scansingle.nextLine()); + + String title = scansingle.nextLine(); + assert title.startsWith("# RESIDUE COILS REM465 HOTLOOPS") : ">Sequence_name must follow column title: # RESIDUE COILS REM465 HOTLOOPS!"; + + while (scansingle.hasNext()) { + seqbuffer.append(scansingle.next()); + coils.add(scansingle.nextFloat()); + rem.add(scansingle.nextFloat()); + hotloops.add(scansingle.nextFloat()); } - disemblRes.addAnnotation(DisemblResultAnnot.COILS, coils); - disemblRes.addAnnotation(DisemblResultAnnot.REM465, rem); - disemblRes.addAnnotation(DisemblResultAnnot.HOTLOOPS, hotloops); - // TODO - // disemblRes.sequence = seqbuffer.toString(); + /* + * Also possible FastaSequence fs = new FastaSequence(sequenceName, 
+ * seqbuffer.toString()); + */ + HashSet scores = new HashSet(); + scores.add(new Score(DisemblResult.COILS, coils, coilsR)); + scores.add(new Score(DisemblResult.HOTLOOPS, hotloops, rem465R)); + scores.add(new Score(DisemblResult.REM465, rem, loopsR)); + results.put(sequenceName, scores); + scansingle.close(); - results.add(disemblRes); } - + scan.close(); input.close(); return results; } /** + * Parsing: + * + * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343, + * 350-391, 429-485, 497-506, 539-547 + * + * # REM465 355-368 + * + * # HOTLOOPS 190-204 + * + * @param lines + * @return + */ + private static TreeSet parseRanges(Enum resultType, String lines) { + TreeSet ranges = new TreeSet(); + + Scanner scan = new Scanner(lines); + + assert scan.hasNext(); + String del = scan.next(); + assert "#".equals(del); // pass delimiter # + String type = scan.next(); // pass enum name e.g. COILS + assert resultType.toString().equalsIgnoreCase(type) : "Unknown result type: " + + resultType.toString(); + + // beginning of the ranges + scan.useDelimiter(","); + while (scan.hasNext()) { + String range = scan.next(); + if (!Util.isEmpty(range)) { + ranges.add(new Range(range.split("-"))); + } + } + return ranges; + } + + /** + * + > Foobar_dundeefriends + * + * # COILS 34-41, 50-58, 83-91, 118-127, 160-169, 191-220, 243-252, 287-343 + * + * # REM465 355-368 + * + * # HOTLOOPS 190-204 + * + * # RESIDUE COILS REM465 HOTLOOPS + * + * M 0.86010 0.88512 0.37094 + * + * T 0.79983 0.85864 0.44331 + * + * >Next Sequence name + * + * + * @param input + * @return + * @throws IOException + * @throws UnknownFileFormatException + */ + public static HashMap> readGlobPlot( + final InputStream input) throws IOException, + UnknownFileFormatException { + Scanner scan = new Scanner(input); + scan.useDelimiter(">"); + if (!scan.hasNext()) { + throw new UnknownFileFormatException( + "In GlobPlot score format each sequence score is expected " + + "to start from the line: 
>Sequence name " + + " No such line was found!"); + } + + HashMap> results = new HashMap>(); + int seqCounter = 0; + while (scan.hasNext()) { + seqCounter++; + String singleSeq = scan.next(); + Scanner scansingle = new Scanner(singleSeq); + if (!scansingle.hasNextLine()) { + throw new RuntimeException( + "The input looks like an incomplete GlobPlot file - cannot parse!"); + } + + StringBuffer seqbuffer = new StringBuffer(); + ArrayList dydxScore = new ArrayList(); + ArrayList rawScore = new ArrayList(); + ArrayList smoothedScore = new ArrayList(); + + String sequenceName = scansingle.nextLine().trim(); + TreeSet domsR = parseRanges(GlobProtResult.GlobDoms, + scansingle.nextLine()); + TreeSet disorderR = parseRanges(GlobProtResult.Disorder, + scansingle.nextLine()); + + String title = scansingle.nextLine(); + assert title.startsWith("# RESIDUE DYDX") : ">Sequence_name must follow column title: # RESIDUE DYDX RAW SMOOTHED!"; + + while (scansingle.hasNext()) { + seqbuffer.append(scansingle.next()); + dydxScore.add(scansingle.nextFloat()); + rawScore.add(scansingle.nextFloat()); + smoothedScore.add(scansingle.nextFloat()); + } + /* + * Also possible FastaSequence fs = new FastaSequence(sequenceName, + * seqbuffer.toString()); + */ + Set scores = new TreeSet(); + scores.add(new Score(GlobProtResult.Disorder, disorderR)); + scores.add(new Score(GlobProtResult.GlobDoms, domsR)); + scores.add(new Score(GlobProtResult.Dydx, dydxScore)); + scores.add(new Score(GlobProtResult.RawScore, rawScore)); + scores.add(new Score(GlobProtResult.SmoothedScore, smoothedScore)); + results.put(sequenceName, scores); + + scansingle.close(); + } + scan.close(); + input.close(); + return results; + } + /** * Read AACon result with no alignment files. This method leaves incoming - * the InputStream results open! + * InputStream open! 
/**
 * Result types reported by DisEMBL. Each of them is stored with both
 * per-residue scores and ranges.
 */
enum DisemblResult {
    COILS, REM465, HOTLOOPS
}

/**
 * Result types reported by GlobPlot.
 */
enum GlobProtResult {
    /** A set of ranges with no scores */
    GlobDoms,
    /** A set of ranges with no scores */
    Disorder,
    /** Scores with no ranges */
    Dydx,
    /** Scores with no ranges */
    SmoothedScore,
    /** Scores with no ranges */
    RawScore
}

/**
 * Result types reported by IUPred.
 */
enum IUPredResult {
    /**
     * Short disorder
     */
    Short,
    /**
     * Long disorder
     */
    Long,
    /**
     * Globular domains
     */
    Glob;

    /**
     * Derives the result type from the end of the file name: a name ending
     * with "long", "short" or "glob" maps to the matching constant. The
     * comparison ignores case, so "P53.LONG" is recognised as well as
     * "p53.long".
     * 
     * @param file
     *            the IUPred result file, must not be null
     * @return the type the file name ends with
     * @throws AssertionError
     *             if the name ends with none of the known types
     */
    static IUPredResult getType(File file) {
        assert file != null;
        // Locale.ROOT keeps the case mapping independent of the default
        // locale.
        final String name = file.getName().toLowerCase(
                java.util.Locale.ROOT);
        // No constant name is a suffix of another one, so the order in
        // which the constants are tried cannot change the outcome.
        for (IUPredResult type : values()) {
            if (name.endsWith(type.name().toLowerCase(java.util.Locale.ROOT))) {
                return type;
            }
        }
        throw new AssertionError(
                "IUPred result file type cannot be recognised! "
                        + "\nFile must ends with one of [glob, long or short]"
                        + "\n but given file name was: " + file.getName());
    }
}