From: Jim Procter Date: Wed, 21 Mar 2018 17:16:07 +0000 (+0000) Subject: Merge branch 'develop' into features/JAL-2909_bamImport2_11 X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=746ac909272ce4b118e7b34d0402ac2cfca1d5ca;hp=bb7a2cab0b0daf1943760e95d64204a92d29e41a;p=jalview.git Merge branch 'develop' into features/JAL-2909_bamImport2_11 --- diff --git a/.classpath b/.classpath index 17523b9..f4b8cf8 100644 --- a/.classpath +++ b/.classpath @@ -66,7 +66,6 @@ - diff --git a/lib/htsjdk-1.133.jar b/lib/htsjdk-1.133.jar deleted file mode 100644 index f084258..0000000 Binary files a/lib/htsjdk-1.133.jar and /dev/null differ diff --git a/resources/lang/Messages.properties b/resources/lang/Messages.properties index 3f5aa94..703adc4 100644 --- a/resources/lang/Messages.properties +++ b/resources/lang/Messages.properties @@ -1367,3 +1367,7 @@ label.most_bound_molecules = Most Bound Molecules label.most_polymer_residues = Most Polymer Residues label.cached_structures = Cached Structures label.free_text_search = Free Text Search +label.bam_file_options = BAM File Options +label.bam_chromosome = Chromosome to load: +label.bam_range = Region to load: +warn.bam_params_not_set = .bam file region parameters have not been set diff --git a/src/jalview/datamodel/CigarBase.java b/src/jalview/datamodel/CigarBase.java index 4e7e03f..a44d74b 100644 --- a/src/jalview/datamodel/CigarBase.java +++ b/src/jalview/datamodel/CigarBase.java @@ -191,76 +191,6 @@ public abstract class CigarBase } /** - * turn a cigar string into a series of operation range pairs - * - * @param cigarString - * String - * @return object[] {char[] operation, int[] range} - * @throws java.lang.Exception - * for improperly formated cigar strings or ones with unknown - * operations - */ - public static Object[] parseCigarString(String cigarString) - throws Exception - { - int ops = 0; - for (int i = 0, l = cigarString.length(); i < l; i++) - { - char c = cigarString.charAt(i); - if (c == M || c == (M - _case_shift) || c == I - || c == (I - _case_shift) || c == D || c == (D - _case_shift)) - { - ops++; - } - } - char[] operation = new char[ops]; - int[] range = new int[ops]; - int op = 0; - int i = 0, l = cigarString.length(); - while (i < l) - { - char c; - int j = i; - do - { - c = cigarString.charAt(j++); - } while (c >= '0' && c <= '9' && j < l); - if (j >= l && c >= '0' && c <= '9') - { - throw new Exception(MessageManager - .getString("exception.unterminated_cigar_string")); - } - try - { - String rangeint = cigarString.substring(i, j - 1); - range[op] = Integer.parseInt(rangeint); - i = j; - } catch (Exception e) - { - throw new Error(MessageManager - .getString("error.implementation_bug_parse_cigar_string")); - } - if (c >= 'a' && c <= 'z') - { - c -= _case_shift; - } - if ((c == M || c == I || c == D)) - { - operation[op++] = c; - } - else - { - throw new Exception(MessageManager.formatMessage( - "exception.unexpected_operation_cigar_string_pos", - new String[] - { new StringBuffer(c).toString(), - Integer.valueOf(i).toString(), cigarString })); - } - } - return new Object[] { operation, range }; - } - - /** * add an operation to cigar string * * @param op diff --git a/src/jalview/datamodel/CigarParser.java b/src/jalview/datamodel/CigarParser.java new file mode 100644 index 0000000..a42c2b1 --- /dev/null +++ b/src/jalview/datamodel/CigarParser.java @@ -0,0 +1,438 @@ +package jalview.datamodel; + +import java.util.Arrays; +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; +import java.util.SortedMap; +import java.util.TreeMap; + +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.SAMRecord; + +public class CigarParser +{ + private final char gapChar; + + public CigarParser(char gap) + { + gapChar = gap; + } + + /** + * Apply the CIGAR string to a read sequence and return the updated read. + * + * @param rec + * the SAM record for the read + * @param insertions + * a map of inserted positions and lengths + * @param alignmentStart + * start of alignment to be used as offset when padding reads with + * gaps + * @param seq + * sequence to be annotated with inserts + * @return string representing read with gaps, clipping etc applied + */ + public String parseCigarToSequence(SAMRecord rec, + SortedMap insertions, int alignmentStart, + SequenceI seq) + { + StringBuilder newRead = new StringBuilder(); + Iterator it = rec.getCigar().getCigarElements() + .iterator(); + + int next = 0; + int refnext = 0; + + // pad with spaces before read + // first count gaps needed to pad to reference position + // NB position is 1-based so number of gaps = pos-1 + int gaps = rec.getStart() - alignmentStart; + + // now count gaps to pad for insertions in other reads + int insertCount = countInsertionsBeforeRead(rec, insertions); + addGaps(newRead, gaps + insertCount); + + int lastinserts = 0; + while (it.hasNext()) + { + CigarElement el = it.next(); + int length = el.getLength(); + + // which insertions coincide with current CIGAR operation? + SortedMap seqInserts = insertions.subMap( + rec.getStart() + refnext, rec.getStart() + refnext + length); + + int[] override = null; + if (lastinserts > 0) + { + if (!seqInserts.containsKey(rec.getStart() + refnext)) + { + // ERROR + int pos = rec.getStart() + refnext; + System.out.println( + "Read " + rec.getReadName() + " has an insertion at " + + pos + " which is not in seqInserts"); + } + else + { + // we just inserted something + // remove these inserts now - should be very first entry + int count = seqInserts.get(rec.getStart() + refnext); + override = new int[] { rec.getStart() + refnext, + count - lastinserts }; + lastinserts = 0; + } + } + + Iterator> iit = seqInserts.entrySet() + .iterator(); + + String nextSegment = applyCigarOp(el, next, rec, iit, override, seq); + newRead.append(nextSegment); + + if (el.getOperator().consumesReferenceBases()) + { + refnext += length; + } + if (el.getOperator().consumesReadBases()) + { + next += length; + } + if (el.getOperator().equals(CigarOperator.I)) + { + // count how many insertions we made + lastinserts = length; + } + } + return newRead.toString(); + } + + /** + * Apply a cigar operation to a segment of a read, accounting for any + * insertions to the reference from other reads in the region + * + * @param el + * the current CIGAR element + * @param next + * location in read to start applying CIGAR op + * @param length + * length of CIGAR op + * @param rec + * SAMRecord corresponding to read + * @param it + * iterator over insertions which coincide with rec + * @param override + * an optional which can override a + * in it to change the length. Set to null if not + * used. + * @param seq + * @return + */ + private String applyCigarOp(CigarElement el, int next, + SAMRecord rec, Iterator> it, + int[] override, SequenceI seq) + { + StringBuilder newRead = new StringBuilder(); + String read = rec.getReadString(); + + int length = el.getLength(); + int nextPos = next; + + switch (el.getOperator()) + { + case M: + case X: + case EQ: + // matched or mismatched residues + newRead = applyCigarConsumeReadAndRef(next, length, rec, it, + override, seq); + break; + case N: // intron in RNA + case D: // deletion + int insertCount = 0; + while (it.hasNext()) + { + // just count all the insertions we need to add in the deletion/intron + // region + Entry entry = it.next(); + insertCount += entry.getValue(); + } + addGaps(newRead, length + insertCount); + break; + case I: + // the reference sequence and other reads should have been gapped for + // this insertion, so just add in the residues + newRead.append(read.substring(nextPos, nextPos + length)); + seq.addSequenceFeature(new SequenceFeature("INSERTION", "", + nextPos + 1, nextPos + length, "bamfile")); + break; + case S: + // soft clipping - just skip this bit of the read + seq.addSequenceFeature(new SequenceFeature("SOFTCLIP", "", + nextPos + 1, nextPos + length, "bamfile")); + break; + case H: + // hard clipping - this stretch will not appear in the read + seq.addSequenceFeature(new SequenceFeature("HARDCLIP", "", + nextPos + 1, nextPos + length, "bamfile")); + break; + default: + break; + } + + return newRead.toString(); + } + + /** + * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X + * + * @param next + * next position in read + * @param length + * length of cigar operation + * @param rec + * SAMRecord to apply the CIGAR op to + * @param it + * iterator over insertions in the CIGAR region + * @param override + * insertions which have already been accounted for + * @param seq + * @return + */ + private StringBuilder applyCigarConsumeReadAndRef(int next, int length, + SAMRecord rec, Iterator> it, + int[] override, SequenceI seq) + { + StringBuilder newRead = new StringBuilder(); + String read = rec.getReadString(); + + int nextPos = next; // location of next position in read + while (nextPos < next + length) + { + int insertPos = next + length; // location of next insert in read (or end + // of read) + int insertLen = 0; // no of gaps to insert after insertPos + if (it.hasNext()) + { + // account for sequence already having insertion + // if an override entry exists it gives the number of positions still to + // be inserted at this point, after accounting for insertions driven by + // the current read. + // e.g. if the current read has 2 inserted positions here, and the + // insertions list also has 2 inserted positions, the override will + // cancel the insert for this read. If the insertions list had 3 + // insertions, the override would reduce this to 1 insertion. + Entry entry = it.next(); + int key = entry.getKey(); + insertLen = entry.getValue(); + if (override != null && key == override[0]) + { + insertLen = override[1]; + } + if (insertLen > 0) + { + insertPos = rec.getReadPositionAtReferencePosition(key) - 1; + } + else + { + insertPos = nextPos; // bugfix - trigger by override + insertions in + // same M stretch + } + } + newRead.append(read.substring(nextPos, insertPos)); + nextPos = insertPos; + addGaps(newRead, insertLen); // add insertions + } + return newRead; + } + + /** + * Get list of positions inserted to the reference sequence. Note: assumes all + * reads are on the same chromosome. + * + * @param it + * Iterator over all the SAMRecords + * @return sorted map of insertion positions + */ + /* + 1234567 + ABCDEFG insert 3,4 + + R1: insert 3 = gap before 3rd res + AB.CDEFG + + R2: insert 4 = gap before 4th res + ABC.DEFG + + R3: insert 3,4 = 2 gaps before 3rd res + AB..CDEFG + + => AB--C-DEFG + + So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1) + Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added + + */ + public SortedMap[] getInsertions(Iterator it) + { + return getInsertions(it, null); + } + + public int firstAlColumn = -1, firstRposCol = -1; + + public SortedMap[] getInsertions(Iterator it, + Range[] extent) + { + SortedMap sinserts[] = new SortedMap[] { + new TreeMap<>(), new TreeMap<>() }; + Range xten = extent != null && extent[0] != null ? extent[0] : null; + do + { + // check extent of read + SAMRecord rec = it.next(); + if (extent != null) + { + + int nstart = rec.getAlignmentStart(); + if (nstart != 0) + { + nstart = rec.getReferencePositionAtReadPosition(nstart); + } + int nend = rec.getAlignmentEnd(); + if (nend != 0) + { + nend = rec.getReferencePositionAtReadPosition(nend); + } + + xten = new Range( + nstart <= 0 ? xten.start : Math.min(nstart, xten.start), + nend == 0 ? xten.end : Math.max(nend, xten.end)); + } + + // check each record for insertions in the CIGAR string + + // pick the insert map to update + int insmap = rec.getReadNegativeStrandFlag() ? 1 : 0; + SortedMap inserts = sinserts[insmap]; + + Iterator cit = rec.getCigar().getCigarElements() + .iterator(); + int refLocation = rec.getAlignmentStart(); + int rloc = 0; + int alcol = 0; + int alpos = 0; + while (cit.hasNext()) + { + CigarElement el = cit.next(); + switch (el.getOperator()) + { + case S: + case H: + // ignore soft/hardclipped + + break; + default: + if (el.getOperator().consumesReadBases() + && !el.getOperator().consumesReferenceBases()) + { + // add to insertions list, and move along the read + // location is the start of next CIGAR segment + // because getReferencePositionAtReadPosition returns 0 for read + // positions which are not in the reference (like insertions) + + // if there's already an insertion at this location, keep the + // longest + // insertion; if there's no insertion keep this one + if (!inserts.containsKey(refLocation) + || (inserts.containsKey(refLocation) + && inserts.get(refLocation) < el.getLength())) + { + inserts.put(refLocation, el.getLength()); + } + break; + } + if (el.getOperator().consumesReferenceBases()) + { + if (el.getOperator().consumesReadBases() && alcol == 0 + && refLocation + 1 > extent[0].start) + { + // first match position : last rloc + first match op + alcol = rloc + 1; + alpos = rec.getReferencePositionAtReadPosition(rloc + 1); + } + refLocation += el.getLength(); + } + + } + if (el.getOperator().consumesReadBases() + || el.getOperator().consumesReferenceBases()) + { + rloc += el.getLength(); + } + } + // Wrong: hack that fails to relocate reference to correct place in + // sequence + if (firstAlColumn < 0) + { + firstAlColumn = alcol; + firstRposCol = alpos; + } + } while (it.hasNext()); + if (xten != null) + { + extent[0] = xten; + } + return sinserts; + } + + /** + * Add sequence of gaps to a read + * + * @param read + * read to update + * @param length + * number of gaps + */ + private void addGaps(StringBuilder read, int length) + { + if (length > 0) + { + char[] gaps = new char[length]; + Arrays.fill(gaps, gapChar); + read.append(gaps); + } + } + + /** + * Get the number of insertions before the start of an aligned read (i.e. + * insertions in soft clipped region are counted too) + * + * @param rec + * the SAMRecord for the read + * @param inserts + * the map of insertions + * @return number of insertions before the read starts + */ + private int countInsertionsBeforeRead(SAMRecord rec, + SortedMap inserts) + { + int gaps = 0; + + // add in any insertion gaps before read + // although we only need to get the submap from alignmentStart, there won't + // be any insertions before that so we can just start at 0 + SortedMap seqInserts = inserts.subMap(0, + rec.getStart()); + + Iterator> it = seqInserts.entrySet() + .iterator(); + while (it.hasNext()) + { + Entry entry = it.next(); + gaps += entry.getValue(); + } + return gaps; + } +} diff --git a/src/jalview/datamodel/SeqCigar.java b/src/jalview/datamodel/SeqCigar.java index c2a6a9c..afe10fc 100644 --- a/src/jalview/datamodel/SeqCigar.java +++ b/src/jalview/datamodel/SeqCigar.java @@ -467,25 +467,6 @@ public class SeqCigar extends CigarSimple } /** - * Create a cigar object from a cigar string like '[]+' Will - * fail if the given seq already contains gaps (JBPNote: future implementation - * will fix) - * - * @param seq - * SequenceI object resolvable to a dataset sequence - * @param cigarString - * String - * @return Cigar - */ - public static SeqCigar parseCigar(SequenceI seq, String cigarString) - throws Exception - { - Object[] opsandrange = parseCigarString(cigarString); - return new SeqCigar(seq, (char[]) opsandrange[0], - (int[]) opsandrange[1]); - } - - /** * create an alignment from the given array of cigar sequences and gap * character, and marking the given segments as visible in the given * hiddenColumns. diff --git a/src/jalview/gui/BamFileOptionsChooser.java b/src/jalview/gui/BamFileOptionsChooser.java new file mode 100644 index 0000000..f456319 --- /dev/null +++ b/src/jalview/gui/BamFileOptionsChooser.java @@ -0,0 +1,125 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ +package jalview.gui; + +import jalview.io.AlignmentFileReaderI; +import jalview.io.BamFile; +import jalview.util.MessageManager; + +import java.awt.GridBagConstraints; +import java.awt.GridBagLayout; +import java.util.Arrays; + +import javax.swing.BorderFactory; +import javax.swing.JComboBox; +import javax.swing.JLabel; +import javax.swing.JPanel; +import javax.swing.JTextField; + +/** + * A dialog where a user can choose import options for a bam file + */ +public class BamFileOptionsChooser extends JPanel +{ + + // text field for start of range + // JFormattedTextField startRange = new JFormattedTextField( + // new NumberFormatter(NumberFormat.getIntegerInstance())); + + // text field for end of range + // JFormattedTextField endRange = new JFormattedTextField( + // new NumberFormatter(NumberFormat.getIntegerInstance())); + + JTextField startRange = new JTextField(10); + + JTextField endRange = new JTextField(10); + + // combo box for chromosome list + JComboBox chroCombo; + + public BamFileOptionsChooser(AlignmentFileReaderI source) + { + Object[] preprocessResult = source.preprocess(); + String[] chromosomes = Arrays.copyOf(preprocessResult, + preprocessResult.length, String[].class); + + setLayout(new GridBagLayout()); + GridBagConstraints c = new GridBagConstraints(); + + setBorder(BorderFactory.createEmptyBorder(0, 10, 10, 10)); + + // set up chromosome input + JLabel chroLabel = new JLabel( + MessageManager.getString("label.bam_chromosome")); + chroCombo = new JComboBox<>(chromosomes); + + // set up region input + JPanel range = new JPanel(); + JLabel rangeLabel = new JLabel( + MessageManager.getString("label.bam_range")); + startRange.setColumns(8); + endRange.setColumns(8); + + JLabel rangeSep = new JLabel("-"); + range.add(startRange); + range.add(rangeSep); + range.add(endRange); + + // chromosome label top left + c.gridx = 0; + c.gridy = 0; + c.anchor = GridBagConstraints.WEST; + c.fill = GridBagConstraints.NONE; + this.add(chroLabel, c); + + // chromosome combo top right + c.gridx = 1; + c.weightx = 1; + c.anchor = GridBagConstraints.WEST; + c.fill = GridBagConstraints.HORIZONTAL; + this.add(chroCombo, c); + + // region label mid left + c.gridx = 0; + c.gridy = 1; + c.anchor = GridBagConstraints.WEST; + c.fill = GridBagConstraints.NONE; + this.add(rangeLabel, c); + + // region input mid right + c.gridx = 1; + c.weightx = 1; + c.anchor = GridBagConstraints.WEST; + c.fill = GridBagConstraints.HORIZONTAL; + this.add(range, c); + } + + public void update(AlignmentFileReaderI source) + { + + int start = Integer.parseInt(startRange.getText()); + int end = Integer.parseInt(endRange.getText()); + ((BamFile) source).setOptions((String) chroCombo.getSelectedItem(), + start, end); + + } + +} diff --git a/src/jalview/io/AlignFile.java b/src/jalview/io/AlignFile.java index 2340283..c931cb8 100755 --- a/src/jalview/io/AlignFile.java +++ b/src/jalview/io/AlignFile.java @@ -304,9 +304,9 @@ public abstract class AlignFile extends FileParse */ protected void initData() { - seqs = new Vector(); - annotations = new Vector(); - seqGroups = new ArrayList(); + seqs = new Vector<>(); + annotations = new Vector<>(); + seqGroups = new ArrayList<>(); parseCalled = false; } @@ -319,7 +319,7 @@ public abstract class AlignFile extends FileParse @Override public void setSeqs(SequenceI[] s) { - seqs = new Vector(); + seqs = new Vector<>(); for (int i = 0; i < s.length; i++) { @@ -390,7 +390,7 @@ public abstract class AlignFile extends FileParse { if (newickStrings == null) { - newickStrings = new Vector(); + newickStrings = new Vector<>(); } newickStrings.addElement(new String[] { treeName, newickString }); } @@ -414,4 +414,13 @@ public abstract class AlignFile extends FileParse { seqs.add(seq); } + + @Override + public Object[] preprocess() + { + // most AlignFiles will not need to return any preprocessing information + // those that do should override this method + return null; + } + } diff --git a/src/jalview/io/AlignmentFileReaderI.java b/src/jalview/io/AlignmentFileReaderI.java index a471d9b..fe47c8a 100644 --- a/src/jalview/io/AlignmentFileReaderI.java +++ b/src/jalview/io/AlignmentFileReaderI.java @@ -20,12 +20,12 @@ */ package jalview.io; -import jalview.api.AlignExportSettingI; -import jalview.api.AlignmentViewPanel; import jalview.api.FeatureSettingsModelI; import jalview.datamodel.AlignmentI; import jalview.datamodel.SequenceI; +import java.io.IOException; + public interface AlignmentFileReaderI { @@ -47,4 +47,13 @@ public interface AlignmentFileReaderI FeatureSettingsModelI getFeatureColourScheme(); + /** + * Get a string array resulting from preprocessing the file + * + * @return preprocess result + */ + Object[] preprocess(); + + public abstract void parse() throws IOException; + } diff --git a/src/jalview/io/AppletFormatAdapter.java b/src/jalview/io/AppletFormatAdapter.java index 5e209e6..da78420 100755 --- a/src/jalview/io/AppletFormatAdapter.java +++ b/src/jalview/io/AppletFormatAdapter.java @@ -147,6 +147,20 @@ public class AppletFormatAdapter public AlignmentI readFile(String file, DataSourceType sourceType, FileFormatI fileFormat) throws IOException { + if (alignFile == null) + { + prepareFileReader(file, sourceType, fileFormat); + } + else + { + alignFile.parse(); + } + return buildAlignmentFromFile(); + } + + public void prepareFileReader(String file, DataSourceType sourceType, + FileFormatI fileFormat) throws IOException + { this.inFile = file; try { @@ -176,10 +190,9 @@ public class AppletFormatAdapter } else { - // alignFile = fileFormat.getAlignmentFile(inFile, sourceType); alignFile = fileFormat.getReader(new FileParse(inFile, sourceType)); } - return buildAlignmentFromFile(); + return; } catch (Exception e) { e.printStackTrace(); @@ -200,7 +213,7 @@ public class AppletFormatAdapter // Possible sequence is just residues with no label alignFile = new FastaFile(">UNKNOWN\n" + inFile, DataSourceType.PASTE); - return buildAlignmentFromFile(); + return; } catch (Exception ex) { diff --git a/src/jalview/io/BamFile.java b/src/jalview/io/BamFile.java new file mode 100644 index 0000000..8d55c53 --- /dev/null +++ b/src/jalview/io/BamFile.java @@ -0,0 +1,322 @@ +/* + * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$) + * Copyright (C) $$Year-Rel$$ The Jalview Authors + * + * This file is part of Jalview. + * + * Jalview is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, either version 3 + * of the License, or (at your option) any later version. + * + * Jalview is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Jalview. If not, see . + * The Jalview Authors are detailed in the 'AUTHORS' file. + */ +package jalview.io; + +import jalview.datamodel.CigarParser; +import jalview.datamodel.Range; +import jalview.datamodel.Sequence; +import jalview.datamodel.SequenceFeature; +import jalview.datamodel.SequenceI; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.PrimitiveIterator.OfInt; +import java.util.SortedMap; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.SamInputResource; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.ValidationStringency; + +public class BamFile extends AlignFile +{ + // SAM/BAM file reader + private SamReader fileReader; + + // start position to read from + private int start = -1; + + // end position to read to + private int end = -1; + + // chromosome/contig to read + private String chromosome = ""; + + // first position in alignment + private int alignmentStart = -1; + + /** + * Creates a new BamFile object. + */ + public BamFile() + { + } + + /** + * Creates a new BamFile object. + * + * @param inFile + * Name of file to read + * @param sourceType + * Whether data source is file, url or other type of data source + * + * @throws IOException + */ + public BamFile(String inFile, DataSourceType sourceType) + throws IOException + { + super(true, inFile, sourceType); + final SamReaderFactory factory = SamReaderFactory.makeDefault() + .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, + SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS) + .validationStringency(ValidationStringency.SILENT); + fileReader = factory.open(new File(inFile)); + } + + /** + * Creates a new BamFile object + * + * @param source + * wrapper for datasource + * @throws IOException + */ + public BamFile(FileParse source) throws IOException + { + super(true, source); + parseSuffix(); + final SamReaderFactory factory = SamReaderFactory.makeDefault() + .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, + SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS) + .validationStringency(ValidationStringency.SILENT); + + // File-based bam + if (source.getDataSourceType() == DataSourceType.FILE) + { + fileReader = factory.open(source.inFile); + } + else + { + // locate index ? + String index = source.getDataName() + ".bai"; + fileReader = factory.open(SamInputResource.of(source.getDataName()) + .index(new URL(index))); + } + } + + @Override + public String print(SequenceI[] seqs, boolean jvsuffix) + { + // TODO Auto-generated method stub + return null; + } + + private StringBuilder insertRefSeq( + SortedMap insertions, Range xtent) + { + StringBuilder refseq = new StringBuilder(); + int inserted = 0; + for (int p = xtent.start; p < xtent.end; p++) + { + refseq.append('N'); + } + for (Map.Entry insert : insertions.entrySet()) + { + int inspos = insert.getKey() - xtent.start + inserted; + if (inspos < 0) + { + System.out.println("Ignoring -ve insert position " + insert.getKey() + + " of " + insert.getValue() + + " (alpos: " + inspos + ")"); + inspos = 0; + // continue; + } + for (int i = 0, j = insert.getValue(); i < j; i++) + { + inserted++; + refseq.insert(inspos, '-'); + } + } + return refseq; + } + + private void padRef(SequenceI ref, CigarParser cig) + { // pad to column + int padding = cig.firstAlColumn - ref.findIndex(cig.firstRposCol); + System.out.println("Padding " + padding + " to move refseq position " + + cig.firstRposCol + " to " + cig.firstAlColumn + "col."); + + ref.insertCharAt(0, padding, '-'); + } + + @Override + public void parse() + { + // only actually parse if params are set + if (chromosome != null && chromosome != "") + { + SAMRecordIterator it = fileReader.query(chromosome, start, end, + false); + CigarParser parser = new CigarParser('-'); + Range[] xtent = new Range[] { new Range(start, end) }; + SortedMap insertions[] = parser.getInsertions(it, + xtent); + it.close(); + + SequenceI refSeq = new Sequence("chr:" + chromosome, + insertRefSeq(insertions[0], xtent[0]) + .toString()); + + refSeq.setStart(xtent[0].start); + refSeq.setEnd(xtent[0].end); + SequenceI revRefSeq = new Sequence("rev:chr:" + chromosome, + insertRefSeq(insertions[1], xtent[0]) + .toString()); + revRefSeq.setStart(xtent[0].start); + revRefSeq.setEnd(xtent[0].end); + + // Hack to move the seqs along + padRef(refSeq, parser); + padRef(revRefSeq, parser); + + it = fileReader.query(chromosome, start, end, false); + + ArrayList fwd = new ArrayList(), rev = new ArrayList(); + while (it.hasNext()) + { + SAMRecord rec = it.next(); + + // set the alignment start to be start of first read (we assume reads + // are sorted) + if (alignmentStart == -1) + { + alignmentStart = rec.getAlignmentStart(); + } + + // make dataset sequence: start at 1, end at read length + SequenceI seq = new Sequence( + "" + (rec.getReadNegativeStrandFlag() ? "rev:" : "") + + rec.getReadName(), + rec.getReadString().toLowerCase()); + seq.setStart(1); + seq.setEnd(rec.getReadLength()); + OfInt q = rec.getBaseQualityString().chars() + .iterator(); + int p = seq.getStart(); + while (q.hasNext()) + { + seq.addSequenceFeature(new SequenceFeature("QUALITY", "", p, p, + (float) q.next() - ' ', "bamfile")); + p++; + } + String newRead = parser.parseCigarToSequence(rec, + insertions[rec.getReadNegativeStrandFlag() ? 1 : 0], + alignmentStart, seq); + + // make alignment sequences + SequenceI alsq = seq.deriveSequence(); + alsq.setSequence(newRead); + + // set start relative to soft clip; assume end is set by Sequence code + alsq.setStart(rec.getStart() - rec.getUnclippedStart() + 1); + if (rec.getReadNegativeStrandFlag()) + { + rev.add(alsq); + } + else + { + fwd.add(alsq); + } + } + // Add forward + if (fwd.size() > 0) + { + seqs.add(refSeq); // FIXME needs to be optional, and properly padded + seqs.addAll(fwd); + fwd.clear(); + } + // and reverse strand reads. + if (rev.size() > 0) + { + seqs.add(revRefSeq); // FIXME needs to be optional and properly padded + seqs.addAll(rev); + rev.clear(); + } + } + } + + /** + * Get the list of chromosomes or contigs from the file (listed in SQ entries + * in BAM file header) + * + * @return array of chromosome/contig strings + */ + @Override + public Object[] preprocess() + { + List refSeqs = fileReader.getFileHeader() + .getSequenceDictionary().getSequences(); + List chrs = new ArrayList<>(); + + for (SAMSequenceRecord ref : refSeqs) + { + chrs.add(ref.getSequenceName()); + } + return chrs.toArray(); + } + + public void setOptions(String chr, int s, int e) + { + chromosome = chr; + start = s; + end = e; + suffix = chromosome + ":" + start + "-" + end; + } + + public boolean parseSuffix() + { + if (suffix == null) + { + return false; + } + int csep = suffix.indexOf(":"); + int rsep = suffix.indexOf("-", csep); + if (csep < 0 || rsep < 0 || suffix.length() - rsep <= 1) + { + return false; + } + String chr, p1, p2; + chr = suffix.substring(0, csep); + p1 = suffix.substring(csep + 1, rsep); + p2 = suffix.substring(rsep + 1); + int cstart, cend; + try + { + cstart = Integer.parseInt(p1); + cend = Integer.parseInt(p2); + } catch (Exception e) + { + warningMessage = (warningMessage == null ? "" : warningMessage + "\n") + + "Couldn't parse range from " + suffix; + return false; + } + chromosome = chr; + start = cstart; + end = cend; + return true; + } +} diff --git a/src/jalview/io/FileFormat.java b/src/jalview/io/FileFormat.java index 4b33dbf..21fadd2 100644 --- a/src/jalview/io/FileFormat.java +++ b/src/jalview/io/FileFormat.java @@ -347,6 +347,27 @@ public enum FileFormat implements FileFormatI return true; } }, + Bam("bam", "bam", true, true) + { + @Override + public AlignmentFileReaderI getReader(FileParse source) + throws IOException + { + return new BamFile(source); + } + + @Override + public AlignmentFileWriterI getWriter(AlignmentI al) + { + return new BamFile(); + } + + @Override + public boolean isStructureFile() + { + return false; + } + }, Jalview("Jalview", "jar,jvp", true, true) { @Override diff --git a/src/jalview/io/FileLoader.java b/src/jalview/io/FileLoader.java index f26d6da..8eb60e5 100755 --- a/src/jalview/io/FileLoader.java +++ b/src/jalview/io/FileLoader.java @@ -32,6 +32,7 @@ import jalview.datamodel.PDBEntry; import jalview.datamodel.SequenceI; import jalview.gui.AlignFrame; import jalview.gui.AlignViewport; +import jalview.gui.BamFileOptionsChooser; import jalview.gui.Desktop; import jalview.gui.Jalview2XML; import jalview.gui.JvOptionPane; @@ -308,6 +309,43 @@ public class FileLoader implements Runnable loadtime = -System.currentTimeMillis(); AlignmentI al = null; + if (FileFormat.Bam.equals(format)) + { + FormatAdapter fa = new FormatAdapter(); + if (source == null) + { + fa.prepareFileReader(file, protocol, format); + source = fa.getAlignFile(); + } + if (!((BamFile) source).parseSuffix()) + { + // configure a window + BamFileOptionsChooser bamoptions = new BamFileOptionsChooser( + source); + // ask the user which bit of the bam they want to load + int confirm = JvOptionPane.showConfirmDialog(null, bamoptions, + MessageManager.getString("label.bam_file_options"), + JvOptionPane.OK_CANCEL_OPTION, + JvOptionPane.PLAIN_MESSAGE); + + if (confirm == JvOptionPane.CANCEL_OPTION + || confirm == JvOptionPane.CLOSED_OPTION) + { + Desktop.instance.stopLoading(); + return; + } + else + { + bamoptions.update(source); + if (file.indexOf("#") == -1) + { + file = file + "#" + ((BamFile) source).suffix; + } + } + } + al = fa.readFile(file, protocol, format); + } + if (FileFormat.Jalview.equals(format)) { if (source != null) @@ -324,37 +362,40 @@ public class FileLoader implements Runnable String error = AppletFormatAdapter.getSupportedFormats(); try { - if (source != null) - { - // read from the provided source - al = new FormatAdapter().readFromFile(source, format); - } - else + if (al == null) { - - // open a new source and read from it - FormatAdapter fa = new FormatAdapter(); - boolean downloadStructureFile = format.isStructureFile() - && protocol.equals(DataSourceType.URL); - if (downloadStructureFile) + if (source != null) { - String structExt = format.getExtensions().split(",")[0]; - String urlLeafName = file.substring( - file.lastIndexOf( - System.getProperty("file.separator")), - file.lastIndexOf(".")); - String tempStructureFileStr = createNamedJvTempFile( - urlLeafName, structExt); - UrlDownloadClient.download(file, tempStructureFileStr); - al = fa.readFile(tempStructureFileStr, DataSourceType.FILE, - format); - source = fa.getAlignFile(); + // read from the provided source + al = new FormatAdapter().readFromFile(source, format); } else { - al = fa.readFile(file, protocol, format); - source = fa.getAlignFile(); // keep reference for later if - // necessary. + + // open a new source and read from it + FormatAdapter fa = new FormatAdapter(); + boolean downloadStructureFile = format.isStructureFile() + && protocol.equals(DataSourceType.URL); + if (downloadStructureFile) + { + String structExt = format.getExtensions().split(",")[0]; + String urlLeafName = file.substring( + file.lastIndexOf( + System.getProperty("file.separator")), + file.lastIndexOf(".")); + String tempStructureFileStr = createNamedJvTempFile( + urlLeafName, structExt); + UrlDownloadClient.download(file, tempStructureFileStr); + al = fa.readFile(tempStructureFileStr, DataSourceType.FILE, + format); + source = fa.getAlignFile(); + } + else + { + al = fa.readFile(file, protocol, format); + source = fa.getAlignFile(); // keep reference for later if + // necessary. + } } } } catch (java.io.IOException ex) diff --git a/src/jalview/io/FileParse.java b/src/jalview/io/FileParse.java index c0328d5..f077e21 100755 --- a/src/jalview/io/FileParse.java +++ b/src/jalview/io/FileParse.java @@ -344,7 +344,8 @@ public class FileParse checkURLSource(fileStr); if (suffixSeparator == '#') { - extractSuffix(fileStr); // URL lref is stored for later reference. + dataName = extractSuffix(fileStr); // URL lref is stored for later + // reference. } } catch (IOException e) { diff --git a/src/jalview/io/FormatAdapter.java b/src/jalview/io/FormatAdapter.java index 6d3c18a..bc3a560 100755 --- a/src/jalview/io/FormatAdapter.java +++ b/src/jalview/io/FormatAdapter.java @@ -266,5 +266,4 @@ public class FormatAdapter extends AppletFormatAdapter source.getDataSourceType()); return readFromFile(fp, format); } - } diff --git a/src/jalview/io/IdentifyFile.java b/src/jalview/io/IdentifyFile.java index ff959b0..e65101d 100755 --- a/src/jalview/io/IdentifyFile.java +++ b/src/jalview/io/IdentifyFile.java @@ -127,14 +127,21 @@ public class IdentifyFile { // jar files are special - since they contain all sorts of random // characters. - if (source.inFile != null) + if (source.inFile != null || source.getDataName() != null) { - String fileStr = source.inFile.getName(); + String fileStr = source.inFile == null ? source.getDataName() + : source.inFile.getName(); // possibly a Jalview archive. if (fileStr.lastIndexOf(".jar") > -1 || fileStr.lastIndexOf(".zip") > -1) { reply = FileFormat.Jalview; + // TODO shouldn't there be a break here? + } + else if (fileStr.lastIndexOf(".bam") > -1) + { + reply = FileFormat.Bam; + break; } } if (!lineswereskipped && data.startsWith("PK")) diff --git a/test/jalview/datamodel/CigarParserTest.java b/test/jalview/datamodel/CigarParserTest.java new file mode 100644 index 0000000..80121ee --- /dev/null +++ b/test/jalview/datamodel/CigarParserTest.java @@ -0,0 +1,146 @@ +package jalview.datamodel; + +import java.util.Iterator; +import java.util.SortedMap; +import java.util.TreeMap; + +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordSetBuilder; + +public class CigarParserTest +{ + @BeforeClass(alwaysRun = true) + public void setup() + { + + + + } + + @DataProvider(name = "reads") + public Object[][] createReadsData() + { + SortedMap noinsertions = new TreeMap<>(); + + SortedMap insertions = new TreeMap<>(); + insertions.put(8, 3); + insertions.put(105, 2); + + SortedMap insertions2 = new TreeMap<>(); + insertions2.put(11, 2); + + SortedMap insertions3 = new TreeMap<>(); + insertions3.put(8, 3); + insertions3.put(105, 3); + + SortedMap insertions4 = new TreeMap<>(); + insertions4.put(8, 3); + insertions4.put(105, 2); + insertions4.put(109, 3); + insertions4.put(112, 1); + + String read = "CGAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC"; + + return new Object[][] { { "1S84M2I14M", read, 21, + "-----------------------GAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC", + insertions }, // first residue is G (accounting for C soft clip) at + // position 21 + 3 (insertions at position 8) + { "1S84M2I14M", read, 21, + "-----------------------GAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGA-GGAGCTCGTTGGTC", + insertions3 }, // read has 2 insertions accounted for in + // insertions3, 3rd insertion is added as gap at + // position 105 + { "1S84M2I14M", read, 21, + "-----------------------GAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAG---CTC-GTTGGTC", + insertions4 }, // 2 insertions in read accounted for at position + // 105; 3 insertions at 109 and 1 insertion at 112 + { "44M1D57M", + read, + 3, + "--CGAAG---CTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTG-AAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC", + insertions }, + { "101M", + read, 4, + "---CGAA---GCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC", + insertions }, + { "6M2D76M19S", + "CGAAGCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTCCC", + 4, + "---CGAAGC----TTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGA", + insertions2 }, + + { "44M1D57M", + read, + 3, + "--CGAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTG-AAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC", + noinsertions }, + { "101M", + read, 4, + "---CGAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC", + noinsertions }, + { "5S96M", read, 7, + "------CTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC", + noinsertions }, + { "96M5H", read, 7, + "------CGAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGT", + noinsertions }, }; + } + + @Test(dataProvider = "reads", groups = { "Functional" }) + public void testParse(String cigar, String read, int start, String result, + SortedMap insertions) + { + SAMRecord rec = new SAMRecord(null); + rec.setCigarString(cigar); + rec.setReadString(read); + rec.setAlignmentStart(start); + + CigarParser cp = new CigarParser('-'); + String bfresult = cp.parseCigarToSequence(rec, insertions, 1); + + System.out.println(result); + System.out.println(bfresult); + Assert.assertEquals(bfresult, result); + } + + @Test(groups = { "Functional" }) + public void testGetInsertions() + { + final SAMRecordSetBuilder builder = new SAMRecordSetBuilder(); + builder.addFrag("read_1", 22, 30000, false, false, + "101M", "", 0); + builder.addFrag("read_2", 22, 28835, false, false, + "50M3I48M", "", 0); + builder.addFrag("read_3", 22, 28835, false, false, "3M1I75M2I1M", "", + 0); + builder.addFrag("read_4", 22, 28865, false, false, "48M3I49M", "", 0); + builder.addFrag("read_5", 22, 28865, false, false, "49M3I47M2D2M", "", + 0); + builder.addFrag("read_6", 22, 27000, false, false, "2M4I90M5S", "", 0); + builder.addFrag("read_7", 22, 27000, false, false, "2M1I98M", "", 0); + + builder.addFrag("read_8", 22, 27000, false, false, "3M200N2I5M", "", 0); + + Iterator it = builder.iterator(); + CigarParser cp = new CigarParser('-'); + SortedMap insertions = cp.getInsertions(it); + Assert.assertEquals(insertions.size(), 6); + Assert.assertTrue(insertions.containsKey(28838)); + Assert.assertEquals((int) insertions.get(28838), 1); + Assert.assertTrue(insertions.containsKey(28885)); + Assert.assertEquals((int) insertions.get(28885), 3); + Assert.assertTrue(insertions.containsKey(28913)); + Assert.assertEquals((int) insertions.get(28913), 3); + Assert.assertTrue(insertions.containsKey(28914)); + Assert.assertEquals((int) insertions.get(28914), 3); + Assert.assertTrue(insertions.containsKey(27002)); + Assert.assertEquals((int) insertions.get(27002), 4); + Assert.assertTrue(insertions.containsKey(27203)); + Assert.assertEquals((int) insertions.get(27203), 2); + } +} diff --git a/test/jalview/datamodel/SeqCigarTest.java b/test/jalview/datamodel/SeqCigarTest.java index 89169d6..57f1156 100644 --- a/test/jalview/datamodel/SeqCigarTest.java +++ b/test/jalview/datamodel/SeqCigarTest.java @@ -90,11 +90,6 @@ public class SeqCigarTest assertEquals("Failed to recover ungapped sequence cigar operations", "42M", cs_null); testCigar_string(s_gapped, ex_cs_gapped); - SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped); - assertEquals("Failed parseCigar", ex_cs_gapped, - gen_sgapped.getCigarstring()); - - testSeqRecovery(gen_sgapped, s_gapped); /* * Test dataset resolution diff --git a/utils/checkstyle/import-control.xml b/utils/checkstyle/import-control.xml index c47aaec..0babda8 100644 --- a/utils/checkstyle/import-control.xml +++ b/utils/checkstyle/import-control.xml @@ -37,6 +37,7 @@ + @@ -116,6 +117,7 @@ +