<classpathentry kind="con" path="org.testng.TESTNG_CONTAINER"/>
<classpathentry kind="lib" path="lib/biojava-core-4.1.0.jar"/>
<classpathentry kind="lib" path="lib/biojava-ontology-4.1.0.jar"/>
- <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"/>
<classpathentry kind="lib" path="lib/htsjdk-2.12.0.jar"/>
<classpathentry kind="lib" path="lib/groovy-all-2.4.12-indy.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"/>
label.most_polymer_residues = Most Polymer Residues
label.cached_structures = Cached Structures
label.free_text_search = Free Text Search
+label.bam_file_options = BAM File Options
+label.bam_chromosome = Chromosome to load:
+label.bam_range = Region to load:
+warn.bam_params_not_set = .bam file region parameters have not been set
}
/**
- * turn a cigar string into a series of operation range pairs
- *
- * @param cigarString
- * String
- * @return object[] {char[] operation, int[] range}
- * @throws java.lang.Exception
- * for improperly formated cigar strings or ones with unknown
- * operations
- */
- public static Object[] parseCigarString(String cigarString)
- throws Exception
- {
- int ops = 0;
- for (int i = 0, l = cigarString.length(); i < l; i++)
- {
- char c = cigarString.charAt(i);
- if (c == M || c == (M - _case_shift) || c == I
- || c == (I - _case_shift) || c == D || c == (D - _case_shift))
- {
- ops++;
- }
- }
- char[] operation = new char[ops];
- int[] range = new int[ops];
- int op = 0;
- int i = 0, l = cigarString.length();
- while (i < l)
- {
- char c;
- int j = i;
- do
- {
- c = cigarString.charAt(j++);
- } while (c >= '0' && c <= '9' && j < l);
- if (j >= l && c >= '0' && c <= '9')
- {
- throw new Exception(MessageManager
- .getString("exception.unterminated_cigar_string"));
- }
- try
- {
- String rangeint = cigarString.substring(i, j - 1);
- range[op] = Integer.parseInt(rangeint);
- i = j;
- } catch (Exception e)
- {
- throw new Error(MessageManager
- .getString("error.implementation_bug_parse_cigar_string"));
- }
- if (c >= 'a' && c <= 'z')
- {
- c -= _case_shift;
- }
- if ((c == M || c == I || c == D))
- {
- operation[op++] = c;
- }
- else
- {
- throw new Exception(MessageManager.formatMessage(
- "exception.unexpected_operation_cigar_string_pos",
- new String[]
- { new StringBuffer(c).toString(),
- Integer.valueOf(i).toString(), cigarString }));
- }
- }
- return new Object[] { operation, range };
- }
-
- /**
* add an operation to cigar string
*
* @param op
--- /dev/null
+package jalview.datamodel;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import htsjdk.samtools.CigarElement;
+import htsjdk.samtools.CigarOperator;
+import htsjdk.samtools.SAMRecord;
+
+public class CigarParser
+{
+ private final char gapChar;
+
+ public CigarParser(char gap)
+ {
+ gapChar = gap;
+ }
+
+ /**
+ * Apply the CIGAR string to a read sequence and return the updated read.
+ *
+ * @param rec
+ * the SAM record for the read
+ * @param insertions
+ * a map of inserted positions and lengths
+ * @param alignmentStart
+ * start of alignment to be used as offset when padding reads with
+ * gaps
+ * @param seq
+ * sequence to be annotated with inserts
+ * @return string representing read with gaps, clipping etc applied
+ */
+ public String parseCigarToSequence(SAMRecord rec,
+ SortedMap<Integer, Integer> insertions, int alignmentStart,
+ SequenceI seq)
+ {
+ StringBuilder newRead = new StringBuilder();
+ Iterator<CigarElement> it = rec.getCigar().getCigarElements()
+ .iterator();
+
+ int next = 0;
+ int refnext = 0;
+
+ // pad with spaces before read
+ // first count gaps needed to pad to reference position
+ // NB position is 1-based so number of gaps = pos-1
+ int gaps = rec.getStart() - alignmentStart;
+
+ // now count gaps to pad for insertions in other reads
+ int insertCount = countInsertionsBeforeRead(rec, insertions);
+ addGaps(newRead, gaps + insertCount);
+
+ int lastinserts = 0;
+ while (it.hasNext())
+ {
+ CigarElement el = it.next();
+ int length = el.getLength();
+
+ // which insertions coincide with current CIGAR operation?
+ SortedMap<Integer, Integer> seqInserts = insertions.subMap(
+ rec.getStart() + refnext, rec.getStart() + refnext + length);
+
+ int[] override = null;
+ if (lastinserts > 0)
+ {
+ if (!seqInserts.containsKey(rec.getStart() + refnext))
+ {
+ // ERROR
+ int pos = rec.getStart() + refnext;
+ System.out.println(
+ "Read " + rec.getReadName() + " has an insertion at "
+ + pos + " which is not in seqInserts");
+ }
+ else
+ {
+ // we just inserted something
+ // remove these inserts now - should be very first entry
+ int count = seqInserts.get(rec.getStart() + refnext);
+ override = new int[] { rec.getStart() + refnext,
+ count - lastinserts };
+ lastinserts = 0;
+ }
+ }
+
+ Iterator<Map.Entry<Integer, Integer>> iit = seqInserts.entrySet()
+ .iterator();
+
+ String nextSegment = applyCigarOp(el, next, rec, iit, override, seq);
+ newRead.append(nextSegment);
+
+ if (el.getOperator().consumesReferenceBases())
+ {
+ refnext += length;
+ }
+ if (el.getOperator().consumesReadBases())
+ {
+ next += length;
+ }
+ if (el.getOperator().equals(CigarOperator.I))
+ {
+ // count how many insertions we made
+ lastinserts = length;
+ }
+ }
+ return newRead.toString();
+ }
+
+ /**
+ * Apply a cigar operation to a segment of a read, accounting for any
+ * insertions to the reference from other reads in the region
+ *
+ * @param el
+ * the current CIGAR element
+ * @param next
+ * location in read to start applying CIGAR op
+ * @param length
+ * length of CIGAR op
+ * @param rec
+ * SAMRecord corresponding to read
+ * @param it
+ * iterator over insertions which coincide with rec
+ * @param override
+ * an optional <insertion position, length> which can override a
+ * <position,length> in it to change the length. Set to null if not
+ * used.
+ * @param seq
+ * @return
+ */
+ private String applyCigarOp(CigarElement el, int next,
+ SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
+ int[] override, SequenceI seq)
+ {
+ StringBuilder newRead = new StringBuilder();
+ String read = rec.getReadString();
+
+ int length = el.getLength();
+ int nextPos = next;
+
+ switch (el.getOperator())
+ {
+ case M:
+ case X:
+ case EQ:
+ // matched or mismatched residues
+ newRead = applyCigarConsumeReadAndRef(next, length, rec, it,
+ override, seq);
+ break;
+ case N: // intron in RNA
+ case D: // deletion
+ int insertCount = 0;
+ while (it.hasNext())
+ {
+ // just count all the insertions we need to add in the deletion/intron
+ // region
+ Entry<Integer, Integer> entry = it.next();
+ insertCount += entry.getValue();
+ }
+ addGaps(newRead, length + insertCount);
+ break;
+ case I:
+ // the reference sequence and other reads should have been gapped for
+ // this insertion, so just add in the residues
+ newRead.append(read.substring(nextPos, nextPos + length));
+ seq.addSequenceFeature(new SequenceFeature("INSERTION", "",
+ nextPos + 1, nextPos + length, "bamfile"));
+ break;
+ case S:
+ // soft clipping - just skip this bit of the read
+ seq.addSequenceFeature(new SequenceFeature("SOFTCLIP", "",
+ nextPos + 1, nextPos + length, "bamfile"));
+ break;
+ case H:
+ // hard clipping - this stretch will not appear in the read
+ seq.addSequenceFeature(new SequenceFeature("HARDCLIP", "",
+ nextPos + 1, nextPos + length, "bamfile"));
+ break;
+ default:
+ break;
+ }
+
+ return newRead.toString();
+ }
+
+ /**
+ * Applies a CIGAR operation which consumes both read and reference i.e. M,=,X
+ *
+ * @param next
+ * next position in read
+ * @param length
+ * length of cigar operation
+ * @param rec
+ * SAMRecord to apply the CIGAR op to
+ * @param it
+ * iterator over insertions in the CIGAR region
+ * @param override
+ * insertions which have already been accounted for
+ * @param seq
+ * @return
+ */
+ private StringBuilder applyCigarConsumeReadAndRef(int next, int length,
+ SAMRecord rec, Iterator<Map.Entry<Integer, Integer>> it,
+ int[] override, SequenceI seq)
+ {
+ StringBuilder newRead = new StringBuilder();
+ String read = rec.getReadString();
+
+ int nextPos = next; // location of next position in read
+ while (nextPos < next + length)
+ {
+ int insertPos = next + length; // location of next insert in read (or end
+ // of read)
+ int insertLen = 0; // no of gaps to insert after insertPos
+ if (it.hasNext())
+ {
+ // account for sequence already having insertion
+ // if an override entry exists it gives the number of positions still to
+ // be inserted at this point, after accounting for insertions driven by
+ // the current read.
+ // e.g. if the current read has 2 inserted positions here, and the
+ // insertions list also has 2 inserted positions, the override will
+ // cancel the insert for this read. If the insertions list had 3
+ // insertions, the override would reduce this to 1 insertion.
+ Entry<Integer, Integer> entry = it.next();
+ int key = entry.getKey();
+ insertLen = entry.getValue();
+ if (override != null && key == override[0])
+ {
+ insertLen = override[1];
+ }
+ if (insertLen > 0)
+ {
+ insertPos = rec.getReadPositionAtReferencePosition(key) - 1;
+ }
+ else
+ {
+ insertPos = nextPos; // bugfix - trigger by override + insertions in
+ // same M stretch
+ }
+ }
+ newRead.append(read.substring(nextPos, insertPos));
+ nextPos = insertPos;
+ addGaps(newRead, insertLen); // add insertions
+ }
+ return newRead;
+ }
+
+ /**
+ * Get list of positions inserted to the reference sequence. Note: assumes all
+ * reads are on the same chromosome.
+ *
+ * @param it
+ * Iterator over all the SAMRecords
+ * @return sorted map of insertion positions <position, insertion size>
+ */
+ /*
+ 1234567
+ ABCDEFG insert 3,4
+
+ R1: insert 3 = gap before 3rd res
+ AB.CDEFG
+
+ R2: insert 4 = gap before 4th res
+ ABC.DEFG
+
+ R3: insert 3,4 = 2 gaps before 3rd res
+ AB..CDEFG
+
+ => AB--C-DEFG
+
+ So need to store insertions as (pos, length) giving here (3,1),(4,1),(3,2) - then can sub longest length at position so (3,2), (4,1)
+ Then when inserting need to include previously inserted count AND determine if sequence already has an insertion or needs gap(s) added
+
+ */
+ public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it)
+ {
+ return getInsertions(it, null);
+ }
+
+ public int firstAlColumn = -1, firstRposCol = -1;
+
+ public SortedMap<Integer, Integer>[] getInsertions(Iterator<SAMRecord> it,
+ Range[] extent)
+ {
+ SortedMap<Integer, Integer> sinserts[] = new SortedMap[] {
+ new TreeMap<>(), new TreeMap<>() };
+ Range xten = extent != null && extent[0] != null ? extent[0] : null;
+ do
+ {
+ // check extent of read
+ SAMRecord rec = it.next();
+ if (extent != null)
+ {
+
+ int nstart = rec.getAlignmentStart();
+ if (nstart != 0)
+ {
+ nstart = rec.getReferencePositionAtReadPosition(nstart);
+ }
+ int nend = rec.getAlignmentEnd();
+ if (nend != 0)
+ {
+ nend = rec.getReferencePositionAtReadPosition(nend);
+ }
+
+ xten = new Range(
+ nstart <= 0 ? xten.start : Math.min(nstart, xten.start),
+ nend == 0 ? xten.end : Math.max(nend, xten.end));
+ }
+
+ // check each record for insertions in the CIGAR string
+
+ // pick the insert map to update
+ int insmap = rec.getReadNegativeStrandFlag() ? 1 : 0;
+ SortedMap<Integer, Integer> inserts = sinserts[insmap];
+
+ Iterator<CigarElement> cit = rec.getCigar().getCigarElements()
+ .iterator();
+ int refLocation = rec.getAlignmentStart();
+ int rloc = 0;
+ int alcol = 0;
+ int alpos = 0;
+ while (cit.hasNext())
+ {
+ CigarElement el = cit.next();
+ switch (el.getOperator())
+ {
+ case S:
+ case H:
+ // ignore soft/hardclipped
+
+ break;
+ default:
+ if (el.getOperator().consumesReadBases()
+ && !el.getOperator().consumesReferenceBases())
+ {
+ // add to insertions list, and move along the read
+ // location is the start of next CIGAR segment
+ // because getReferencePositionAtReadPosition returns 0 for read
+ // positions which are not in the reference (like insertions)
+
+ // if there's already an insertion at this location, keep the
+ // longest
+ // insertion; if there's no insertion keep this one
+ if (!inserts.containsKey(refLocation)
+ || (inserts.containsKey(refLocation)
+ && inserts.get(refLocation) < el.getLength()))
+ {
+ inserts.put(refLocation, el.getLength());
+ }
+ break;
+ }
+ if (el.getOperator().consumesReferenceBases())
+ {
+ if (el.getOperator().consumesReadBases() && alcol == 0
+ && refLocation + 1 > extent[0].start)
+ {
+ // first match position : last rloc + first match op
+ alcol = rloc + 1;
+ alpos = rec.getReferencePositionAtReadPosition(rloc + 1);
+ }
+ refLocation += el.getLength();
+ }
+
+ }
+ if (el.getOperator().consumesReadBases()
+ || el.getOperator().consumesReferenceBases())
+ {
+ rloc += el.getLength();
+ }
+ }
+ // Wrong: hack that fails to relocate reference to correct place in
+ // sequence
+ if (firstAlColumn < 0)
+ {
+ firstAlColumn = alcol;
+ firstRposCol = alpos;
+ }
+ } while (it.hasNext());
+ if (xten != null)
+ {
+ extent[0] = xten;
+ }
+ return sinserts;
+ }
+
+ /**
+ * Add sequence of gaps to a read
+ *
+ * @param read
+ * read to update
+ * @param length
+ * number of gaps
+ */
+ private void addGaps(StringBuilder read, int length)
+ {
+ if (length > 0)
+ {
+ char[] gaps = new char[length];
+ Arrays.fill(gaps, gapChar);
+ read.append(gaps);
+ }
+ }
+
+ /**
+ * Get the number of insertions before the start of an aligned read (i.e.
+ * insertions in soft clipped region are counted too)
+ *
+ * @param rec
+ * the SAMRecord for the read
+ * @param inserts
+ * the map of insertions
+ * @return number of insertions before the read starts
+ */
+ private int countInsertionsBeforeRead(SAMRecord rec,
+ SortedMap<Integer, Integer> inserts)
+ {
+ int gaps = 0;
+
+ // add in any insertion gaps before read
+ // although we only need to get the submap from alignmentStart, there won't
+ // be any insertions before that so we can just start at 0
+ SortedMap<Integer, Integer> seqInserts = inserts.subMap(0,
+ rec.getStart());
+
+ Iterator<Map.Entry<Integer, Integer>> it = seqInserts.entrySet()
+ .iterator();
+ while (it.hasNext())
+ {
+ Entry<Integer, Integer> entry = it.next();
+ gaps += entry.getValue();
+ }
+ return gaps;
+ }
+}
}
/**
- * Create a cigar object from a cigar string like '[<I|D|M><range>]+' Will
- * fail if the given seq already contains gaps (JBPNote: future implementation
- * will fix)
- *
- * @param seq
- * SequenceI object resolvable to a dataset sequence
- * @param cigarString
- * String
- * @return Cigar
- */
- public static SeqCigar parseCigar(SequenceI seq, String cigarString)
- throws Exception
- {
- Object[] opsandrange = parseCigarString(cigarString);
- return new SeqCigar(seq, (char[]) opsandrange[0],
- (int[]) opsandrange[1]);
- }
-
- /**
* create an alignment from the given array of cigar sequences and gap
* character, and marking the given segments as visible in the given
* hiddenColumns.
--- /dev/null
+/*
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ *
+ * This file is part of Jalview.
+ *
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *
+ * Jalview is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
+ */
+package jalview.gui;
+
+import jalview.io.AlignmentFileReaderI;
+import jalview.io.BamFile;
+import jalview.util.MessageManager;
+
+import java.awt.GridBagConstraints;
+import java.awt.GridBagLayout;
+import java.util.Arrays;
+
+import javax.swing.BorderFactory;
+import javax.swing.JComboBox;
+import javax.swing.JLabel;
+import javax.swing.JPanel;
+import javax.swing.JTextField;
+
+/**
+ * A dialog where a user can choose import options for a bam file
+ */
+public class BamFileOptionsChooser extends JPanel
+{
+
+ // text field for start of range
+ // JFormattedTextField startRange = new JFormattedTextField(
+ // new NumberFormatter(NumberFormat.getIntegerInstance()));
+
+ // text field for end of range
+ // JFormattedTextField endRange = new JFormattedTextField(
+ // new NumberFormatter(NumberFormat.getIntegerInstance()));
+
+ JTextField startRange = new JTextField(10);
+
+ JTextField endRange = new JTextField(10);
+
+ // combo box for chromosome list
+ JComboBox<String> chroCombo;
+
+ public BamFileOptionsChooser(AlignmentFileReaderI source)
+ {
+ Object[] preprocessResult = source.preprocess();
+ String[] chromosomes = Arrays.copyOf(preprocessResult,
+ preprocessResult.length, String[].class);
+
+ setLayout(new GridBagLayout());
+ GridBagConstraints c = new GridBagConstraints();
+
+ setBorder(BorderFactory.createEmptyBorder(0, 10, 10, 10));
+
+ // set up chromosome input
+ JLabel chroLabel = new JLabel(
+ MessageManager.getString("label.bam_chromosome"));
+ chroCombo = new JComboBox<>(chromosomes);
+
+ // set up region input
+ JPanel range = new JPanel();
+ JLabel rangeLabel = new JLabel(
+ MessageManager.getString("label.bam_range"));
+ startRange.setColumns(8);
+ endRange.setColumns(8);
+
+ JLabel rangeSep = new JLabel("-");
+ range.add(startRange);
+ range.add(rangeSep);
+ range.add(endRange);
+
+ // chromosome label top left
+ c.gridx = 0;
+ c.gridy = 0;
+ c.anchor = GridBagConstraints.WEST;
+ c.fill = GridBagConstraints.NONE;
+ this.add(chroLabel, c);
+
+ // chromosome combo top right
+ c.gridx = 1;
+ c.weightx = 1;
+ c.anchor = GridBagConstraints.WEST;
+ c.fill = GridBagConstraints.HORIZONTAL;
+ this.add(chroCombo, c);
+
+ // region label mid left
+ c.gridx = 0;
+ c.gridy = 1;
+ c.anchor = GridBagConstraints.WEST;
+ c.fill = GridBagConstraints.NONE;
+ this.add(rangeLabel, c);
+
+ // region input mid right
+ c.gridx = 1;
+ c.weightx = 1;
+ c.anchor = GridBagConstraints.WEST;
+ c.fill = GridBagConstraints.HORIZONTAL;
+ this.add(range, c);
+ }
+
+ public void update(AlignmentFileReaderI source)
+ {
+
+ int start = Integer.parseInt(startRange.getText());
+ int end = Integer.parseInt(endRange.getText());
+ ((BamFile) source).setOptions((String) chroCombo.getSelectedItem(),
+ start, end);
+
+ }
+
+}
*/
protected void initData()
{
- seqs = new Vector<SequenceI>();
- annotations = new Vector<AlignmentAnnotation>();
- seqGroups = new ArrayList<SequenceGroup>();
+ seqs = new Vector<>();
+ annotations = new Vector<>();
+ seqGroups = new ArrayList<>();
parseCalled = false;
}
@Override
public void setSeqs(SequenceI[] s)
{
- seqs = new Vector<SequenceI>();
+ seqs = new Vector<>();
for (int i = 0; i < s.length; i++)
{
{
if (newickStrings == null)
{
- newickStrings = new Vector<String[]>();
+ newickStrings = new Vector<>();
}
newickStrings.addElement(new String[] { treeName, newickString });
}
{
seqs.add(seq);
}
+
+ @Override
+ public Object[] preprocess()
+ {
+ // most AlignFiles will not need to return any preprocessing information
+ // those that do should override this method
+ return null;
+ }
+
}
*/
package jalview.io;
-import jalview.api.AlignExportSettingI;
-import jalview.api.AlignmentViewPanel;
import jalview.api.FeatureSettingsModelI;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.SequenceI;
+import java.io.IOException;
+
public interface AlignmentFileReaderI
{
FeatureSettingsModelI getFeatureColourScheme();
+ /**
+ * Get a string array resulting from preprocessing the file
+ *
+ * @return preprocess result
+ */
+ Object[] preprocess();
+
+ public abstract void parse() throws IOException;
+
}
public AlignmentI readFile(String file, DataSourceType sourceType,
FileFormatI fileFormat) throws IOException
{
+ if (alignFile == null)
+ {
+ prepareFileReader(file, sourceType, fileFormat);
+ }
+ else
+ {
+ alignFile.parse();
+ }
+ return buildAlignmentFromFile();
+ }
+
+ public void prepareFileReader(String file, DataSourceType sourceType,
+ FileFormatI fileFormat) throws IOException
+ {
this.inFile = file;
try
{
}
else
{
- // alignFile = fileFormat.getAlignmentFile(inFile, sourceType);
alignFile = fileFormat.getReader(new FileParse(inFile, sourceType));
}
- return buildAlignmentFromFile();
+ return;
} catch (Exception e)
{
e.printStackTrace();
// Possible sequence is just residues with no label
alignFile = new FastaFile(">UNKNOWN\n" + inFile,
DataSourceType.PASTE);
- return buildAlignmentFromFile();
+ return;
} catch (Exception ex)
{
--- /dev/null
+/*
+ * Jalview - A Sequence Alignment Editor and Viewer ($$Version-Rel$$)
+ * Copyright (C) $$Year-Rel$$ The Jalview Authors
+ *
+ * This file is part of Jalview.
+ *
+ * Jalview is free software: you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, either version 3
+ * of the License, or (at your option) any later version.
+ *
+ * Jalview is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Jalview. If not, see <http://www.gnu.org/licenses/>.
+ * The Jalview Authors are detailed in the 'AUTHORS' file.
+ */
+package jalview.io;
+
+import jalview.datamodel.CigarParser;
+import jalview.datamodel.Range;
+import jalview.datamodel.Sequence;
+import jalview.datamodel.SequenceFeature;
+import jalview.datamodel.SequenceI;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.PrimitiveIterator.OfInt;
+import java.util.SortedMap;
+
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SAMRecordIterator;
+import htsjdk.samtools.SAMSequenceRecord;
+import htsjdk.samtools.SamInputResource;
+import htsjdk.samtools.SamReader;
+import htsjdk.samtools.SamReaderFactory;
+import htsjdk.samtools.ValidationStringency;
+
+public class BamFile extends AlignFile
+{
+ // SAM/BAM file reader
+ private SamReader fileReader;
+
+ // start position to read from
+ private int start = -1;
+
+ // end position to read to
+ private int end = -1;
+
+ // chromosome/contig to read
+ private String chromosome = "";
+
+ // first position in alignment
+ private int alignmentStart = -1;
+
+ /**
+ * Creates a new BamFile object.
+ */
+ public BamFile()
+ {
+ }
+
+ /**
+ * Creates a new BamFile object.
+ *
+ * @param inFile
+ * Name of file to read
+ * @param sourceType
+ * Whether data source is file, url or other type of data source
+ *
+ * @throws IOException
+ */
+ public BamFile(String inFile, DataSourceType sourceType)
+ throws IOException
+ {
+ super(true, inFile, sourceType);
+ final SamReaderFactory factory = SamReaderFactory.makeDefault()
+ .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
+ SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
+ .validationStringency(ValidationStringency.SILENT);
+ fileReader = factory.open(new File(inFile));
+ }
+
+ /**
+ * Creates a new BamFile object
+ *
+ * @param source
+ * wrapper for datasource
+ * @throws IOException
+ */
+ public BamFile(FileParse source) throws IOException
+ {
+ super(true, source);
+ parseSuffix();
+ final SamReaderFactory factory = SamReaderFactory.makeDefault()
+ .enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS,
+ SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
+ .validationStringency(ValidationStringency.SILENT);
+
+ // File-based bam
+ if (source.getDataSourceType() == DataSourceType.FILE)
+ {
+ fileReader = factory.open(source.inFile);
+ }
+ else
+ {
+ // locate index ?
+ String index = source.getDataName() + ".bai";
+ fileReader = factory.open(SamInputResource.of(source.getDataName())
+ .index(new URL(index)));
+ }
+ }
+
+ @Override
+ public String print(SequenceI[] seqs, boolean jvsuffix)
+ {
+ // TODO Auto-generated method stub
+ return null;
+ }
+
+ private StringBuilder insertRefSeq(
+ SortedMap<Integer, Integer> insertions, Range xtent)
+ {
+ StringBuilder refseq = new StringBuilder();
+ int inserted = 0;
+ for (int p = xtent.start; p < xtent.end; p++)
+ {
+ refseq.append('N');
+ }
+ for (Map.Entry<Integer, Integer> insert : insertions.entrySet())
+ {
+ int inspos = insert.getKey() - xtent.start + inserted;
+ if (inspos < 0)
+ {
+ System.out.println("Ignoring -ve insert position " + insert.getKey()
+ + " of " + insert.getValue() +
+ " (alpos: " + inspos + ")");
+ inspos = 0;
+ // continue;
+ }
+ for (int i = 0, j = insert.getValue(); i < j; i++)
+ {
+ inserted++;
+ refseq.insert(inspos, '-');
+ }
+ }
+ return refseq;
+ }
+
+ private void padRef(SequenceI ref, CigarParser cig)
+ { // pad to column
+ int padding = cig.firstAlColumn - ref.findIndex(cig.firstRposCol);
+ System.out.println("Padding " + padding + " to move refseq position "
+ + cig.firstRposCol + " to " + cig.firstAlColumn + "col.");
+
+ ref.insertCharAt(0, padding, '-');
+ }
+
+ @Override
+ public void parse()
+ {
+ // only actually parse if params are set
+ if (chromosome != null && chromosome != "")
+ {
+ SAMRecordIterator it = fileReader.query(chromosome, start, end,
+ false);
+ CigarParser parser = new CigarParser('-');
+ Range[] xtent = new Range[] { new Range(start, end) };
+ SortedMap<Integer, Integer> insertions[] = parser.getInsertions(it,
+ xtent);
+ it.close();
+
+ SequenceI refSeq = new Sequence("chr:" + chromosome,
+ insertRefSeq(insertions[0], xtent[0])
+ .toString());
+
+ refSeq.setStart(xtent[0].start);
+ refSeq.setEnd(xtent[0].end);
+ SequenceI revRefSeq = new Sequence("rev:chr:" + chromosome,
+ insertRefSeq(insertions[1], xtent[0])
+ .toString());
+ revRefSeq.setStart(xtent[0].start);
+ revRefSeq.setEnd(xtent[0].end);
+
+ // Hack to move the seqs along
+ padRef(refSeq, parser);
+ padRef(revRefSeq, parser);
+
+ it = fileReader.query(chromosome, start, end, false);
+
+ ArrayList<SequenceI> fwd = new ArrayList(), rev = new ArrayList();
+ while (it.hasNext())
+ {
+ SAMRecord rec = it.next();
+
+ // set the alignment start to be start of first read (we assume reads
+ // are sorted)
+ if (alignmentStart == -1)
+ {
+ alignmentStart = rec.getAlignmentStart();
+ }
+
+ // make dataset sequence: start at 1, end at read length
+ SequenceI seq = new Sequence(
+ "" + (rec.getReadNegativeStrandFlag() ? "rev:" : "")
+ + rec.getReadName(),
+ rec.getReadString().toLowerCase());
+ seq.setStart(1);
+ seq.setEnd(rec.getReadLength());
+ OfInt q = rec.getBaseQualityString().chars()
+ .iterator();
+ int p = seq.getStart();
+ while (q.hasNext())
+ {
+ seq.addSequenceFeature(new SequenceFeature("QUALITY", "", p, p,
+ (float) q.next() - ' ', "bamfile"));
+ p++;
+ }
+ String newRead = parser.parseCigarToSequence(rec,
+ insertions[rec.getReadNegativeStrandFlag() ? 1 : 0],
+ alignmentStart, seq);
+
+ // make alignment sequences
+ SequenceI alsq = seq.deriveSequence();
+ alsq.setSequence(newRead);
+
+ // set start relative to soft clip; assume end is set by Sequence code
+ alsq.setStart(rec.getStart() - rec.getUnclippedStart() + 1);
+ if (rec.getReadNegativeStrandFlag())
+ {
+ rev.add(alsq);
+ }
+ else
+ {
+ fwd.add(alsq);
+ }
+ }
+ // Add forward
+ if (fwd.size() > 0)
+ {
+ seqs.add(refSeq); // FIXME needs to be optional, and properly padded
+ seqs.addAll(fwd);
+ fwd.clear();
+ }
+ // and reverse strand reads.
+ if (rev.size() > 0)
+ {
+ seqs.add(revRefSeq); // FIXME needs to be optional and properly padded
+ seqs.addAll(rev);
+ rev.clear();
+ }
+ }
+ }
+
+ /**
+ * Get the list of chromosomes or contigs from the file (listed in SQ entries
+ * in BAM file header)
+ *
+ * @return array of chromosome/contig strings
+ */
+ @Override
+ public Object[] preprocess()
+ {
+ List<SAMSequenceRecord> refSeqs = fileReader.getFileHeader()
+ .getSequenceDictionary().getSequences();
+ List<String> chrs = new ArrayList<>();
+
+ for (SAMSequenceRecord ref : refSeqs)
+ {
+ chrs.add(ref.getSequenceName());
+ }
+ return chrs.toArray();
+ }
+
+ public void setOptions(String chr, int s, int e)
+ {
+ chromosome = chr;
+ start = s;
+ end = e;
+ suffix = chromosome + ":" + start + "-" + end;
+ }
+
+ public boolean parseSuffix()
+ {
+ if (suffix == null)
+ {
+ return false;
+ }
+ int csep = suffix.indexOf(":");
+ int rsep = suffix.indexOf("-", csep);
+ if (csep < 0 || rsep < 0 || suffix.length() - rsep <= 1)
+ {
+ return false;
+ }
+ String chr, p1, p2;
+ chr = suffix.substring(0, csep);
+ p1 = suffix.substring(csep + 1, rsep);
+ p2 = suffix.substring(rsep + 1);
+ int cstart, cend;
+ try
+ {
+ cstart = Integer.parseInt(p1);
+ cend = Integer.parseInt(p2);
+ } catch (Exception e)
+ {
+ warningMessage = (warningMessage == null ? "" : warningMessage + "\n")
+ + "Couldn't parse range from " + suffix;
+ return false;
+ }
+ chromosome = chr;
+ start = cstart;
+ end = cend;
+ return true;
+ }
+}
return true;
}
},
+ Bam("bam", "bam", true, true)
+ {
+ @Override
+ public AlignmentFileReaderI getReader(FileParse source)
+ throws IOException
+ {
+ return new BamFile(source);
+ }
+
+ @Override
+ public AlignmentFileWriterI getWriter(AlignmentI al)
+ {
+ return new BamFile();
+ }
+
+ @Override
+ public boolean isStructureFile()
+ {
+ return false;
+ }
+ },
Jalview("Jalview", "jar,jvp", true, true)
{
@Override
import jalview.datamodel.SequenceI;
import jalview.gui.AlignFrame;
import jalview.gui.AlignViewport;
+import jalview.gui.BamFileOptionsChooser;
import jalview.gui.Desktop;
import jalview.gui.Jalview2XML;
import jalview.gui.JvOptionPane;
loadtime = -System.currentTimeMillis();
AlignmentI al = null;
+ if (FileFormat.Bam.equals(format))
+ {
+ FormatAdapter fa = new FormatAdapter();
+ if (source == null)
+ {
+ fa.prepareFileReader(file, protocol, format);
+ source = fa.getAlignFile();
+ }
+ if (!((BamFile) source).parseSuffix())
+ {
+ // configure a window
+ BamFileOptionsChooser bamoptions = new BamFileOptionsChooser(
+ source);
+ // ask the user which bit of the bam they want to load
+ int confirm = JvOptionPane.showConfirmDialog(null, bamoptions,
+ MessageManager.getString("label.bam_file_options"),
+ JvOptionPane.OK_CANCEL_OPTION,
+ JvOptionPane.PLAIN_MESSAGE);
+
+ if (confirm == JvOptionPane.CANCEL_OPTION
+ || confirm == JvOptionPane.CLOSED_OPTION)
+ {
+ Desktop.instance.stopLoading();
+ return;
+ }
+ else
+ {
+ bamoptions.update(source);
+ if (file.indexOf("#") == -1)
+ {
+ file = file + "#" + ((BamFile) source).suffix;
+ }
+ }
+ }
+ al = fa.readFile(file, protocol, format);
+ }
+
if (FileFormat.Jalview.equals(format))
{
if (source != null)
String error = AppletFormatAdapter.getSupportedFormats();
try
{
- if (source != null)
- {
- // read from the provided source
- al = new FormatAdapter().readFromFile(source, format);
- }
- else
+ if (al == null)
{
-
- // open a new source and read from it
- FormatAdapter fa = new FormatAdapter();
- boolean downloadStructureFile = format.isStructureFile()
- && protocol.equals(DataSourceType.URL);
- if (downloadStructureFile)
+ if (source != null)
{
- String structExt = format.getExtensions().split(",")[0];
- String urlLeafName = file.substring(
- file.lastIndexOf(
- System.getProperty("file.separator")),
- file.lastIndexOf("."));
- String tempStructureFileStr = createNamedJvTempFile(
- urlLeafName, structExt);
- UrlDownloadClient.download(file, tempStructureFileStr);
- al = fa.readFile(tempStructureFileStr, DataSourceType.FILE,
- format);
- source = fa.getAlignFile();
+ // read from the provided source
+ al = new FormatAdapter().readFromFile(source, format);
}
else
{
- al = fa.readFile(file, protocol, format);
- source = fa.getAlignFile(); // keep reference for later if
- // necessary.
+
+ // open a new source and read from it
+ FormatAdapter fa = new FormatAdapter();
+ boolean downloadStructureFile = format.isStructureFile()
+ && protocol.equals(DataSourceType.URL);
+ if (downloadStructureFile)
+ {
+ String structExt = format.getExtensions().split(",")[0];
+ String urlLeafName = file.substring(
+ file.lastIndexOf(
+ System.getProperty("file.separator")),
+ file.lastIndexOf("."));
+ String tempStructureFileStr = createNamedJvTempFile(
+ urlLeafName, structExt);
+ UrlDownloadClient.download(file, tempStructureFileStr);
+ al = fa.readFile(tempStructureFileStr, DataSourceType.FILE,
+ format);
+ source = fa.getAlignFile();
+ }
+ else
+ {
+ al = fa.readFile(file, protocol, format);
+ source = fa.getAlignFile(); // keep reference for later if
+ // necessary.
+ }
}
}
} catch (java.io.IOException ex)
checkURLSource(fileStr);
if (suffixSeparator == '#')
{
- extractSuffix(fileStr); // URL lref is stored for later reference.
+ dataName = extractSuffix(fileStr); // URL lref is stored for later
+ // reference.
}
} catch (IOException e)
{
source.getDataSourceType());
return readFromFile(fp, format);
}
-
}
{
// jar files are special - since they contain all sorts of random
// characters.
- if (source.inFile != null)
+ if (source.inFile != null || source.getDataName() != null)
{
- String fileStr = source.inFile.getName();
+ String fileStr = source.inFile == null ? source.getDataName()
+ : source.inFile.getName();
// possibly a Jalview archive.
if (fileStr.lastIndexOf(".jar") > -1
|| fileStr.lastIndexOf(".zip") > -1)
{
reply = FileFormat.Jalview;
+ // TODO shouldn't there be a break here?
+ }
+ else if (fileStr.lastIndexOf(".bam") > -1)
+ {
+ reply = FileFormat.Bam;
+ break;
}
}
if (!lineswereskipped && data.startsWith("PK"))
--- /dev/null
+package jalview.datamodel;
+
+import java.util.Iterator;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import org.testng.Assert;
+import org.testng.annotations.BeforeClass;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import htsjdk.samtools.SAMRecord;
+import htsjdk.samtools.SAMRecordSetBuilder;
+
+public class CigarParserTest
+{
+ @BeforeClass(alwaysRun = true)
+ public void setup()
+ {
+
+
+
+ }
+
+ @DataProvider(name = "reads")
+ public Object[][] createReadsData()
+ {
+ SortedMap<Integer, Integer> noinsertions = new TreeMap<>();
+
+ SortedMap<Integer, Integer> insertions = new TreeMap<>();
+ insertions.put(8, 3);
+ insertions.put(105, 2);
+
+ SortedMap<Integer, Integer> insertions2 = new TreeMap<>();
+ insertions2.put(11, 2);
+
+ SortedMap<Integer, Integer> insertions3 = new TreeMap<>();
+ insertions3.put(8, 3);
+ insertions3.put(105, 3);
+
+ SortedMap<Integer, Integer> insertions4 = new TreeMap<>();
+ insertions4.put(8, 3);
+ insertions4.put(105, 2);
+ insertions4.put(109, 3);
+ insertions4.put(112, 1);
+
+ String read = "CGAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC";
+
+ return new Object[][] { { "1S84M2I14M", read, 21,
+ "-----------------------GAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC",
+ insertions }, // first residue is G (accounting for C soft clip) at
+ // position 21 + 3 (insertions at position 8)
+ { "1S84M2I14M", read, 21,
+ "-----------------------GAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGA-GGAGCTCGTTGGTC",
+ insertions3 }, // read has 2 insertions accounted for in
+ // insertions3, 3rd insertion is added as gap at
+ // position 105
+ { "1S84M2I14M", read, 21,
+ "-----------------------GAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAG---CTC-GTTGGTC",
+ insertions4 }, // 2 insertions in read accounted for at position
+ // 105; 3 insertions at 109 and 1 insertion at 112
+ { "44M1D57M",
+ read,
+ 3,
+ "--CGAAG---CTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTG-AAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC",
+ insertions },
+ { "101M",
+ read, 4,
+ "---CGAA---GCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC",
+ insertions },
+ { "6M2D76M19S",
+ "CGAAGCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTCCC",
+ 4,
+ "---CGAAGC----TTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGA",
+ insertions2 },
+
+ { "44M1D57M",
+ read,
+ 3,
+ "--CGAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTG-AAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC",
+ noinsertions },
+ { "101M",
+ read, 4,
+ "---CGAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC",
+ noinsertions },
+ { "5S96M", read, 7,
+ "------CTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGTTGGTC",
+ noinsertions },
+ { "96M5H", read, 7,
+ "------CGAAGCTCTTTACCCGGAAACCATTGAAATCGGACGGTTTAGTGAAATGGAGGATCAAGTTGGGTTTGGGTTCCGTCCGAACGACGAGGAGCTCGT",
+ noinsertions }, };
+ }
+
+ @Test(dataProvider = "reads", groups = { "Functional" })
+ public void testParse(String cigar, String read, int start, String result,
+ SortedMap<Integer, Integer> insertions)
+ {
+ SAMRecord rec = new SAMRecord(null);
+ rec.setCigarString(cigar);
+ rec.setReadString(read);
+ rec.setAlignmentStart(start);
+
+ CigarParser cp = new CigarParser('-');
+ String bfresult = cp.parseCigarToSequence(rec, insertions, 1);
+
+ System.out.println(result);
+ System.out.println(bfresult);
+ Assert.assertEquals(bfresult, result);
+ }
+
+ @Test(groups = { "Functional" })
+ public void testGetInsertions()
+ {
+ final SAMRecordSetBuilder builder = new SAMRecordSetBuilder();
+ builder.addFrag("read_1", 22, 30000, false, false,
+ "101M", "", 0);
+ builder.addFrag("read_2", 22, 28835, false, false,
+ "50M3I48M", "", 0);
+ builder.addFrag("read_3", 22, 28835, false, false, "3M1I75M2I1M", "",
+ 0);
+ builder.addFrag("read_4", 22, 28865, false, false, "48M3I49M", "", 0);
+ builder.addFrag("read_5", 22, 28865, false, false, "49M3I47M2D2M", "",
+ 0);
+ builder.addFrag("read_6", 22, 27000, false, false, "2M4I90M5S", "", 0);
+ builder.addFrag("read_7", 22, 27000, false, false, "2M1I98M", "", 0);
+
+ builder.addFrag("read_8", 22, 27000, false, false, "3M200N2I5M", "", 0);
+
+ Iterator<SAMRecord> it = builder.iterator();
+ CigarParser cp = new CigarParser('-');
+ SortedMap<Integer, Integer> insertions = cp.getInsertions(it);
+ Assert.assertEquals(insertions.size(), 6);
+ Assert.assertTrue(insertions.containsKey(28838));
+ Assert.assertEquals((int) insertions.get(28838), 1);
+ Assert.assertTrue(insertions.containsKey(28885));
+ Assert.assertEquals((int) insertions.get(28885), 3);
+ Assert.assertTrue(insertions.containsKey(28913));
+ Assert.assertEquals((int) insertions.get(28913), 3);
+ Assert.assertTrue(insertions.containsKey(28914));
+ Assert.assertEquals((int) insertions.get(28914), 3);
+ Assert.assertTrue(insertions.containsKey(27002));
+ Assert.assertEquals((int) insertions.get(27002), 4);
+ Assert.assertTrue(insertions.containsKey(27203));
+ Assert.assertEquals((int) insertions.get(27203), 2);
+ }
+}
assertEquals("Failed to recover ungapped sequence cigar operations",
"42M", cs_null);
testCigar_string(s_gapped, ex_cs_gapped);
- SeqCigar gen_sgapped = SeqCigar.parseCigar(s, ex_cs_gapped);
- assertEquals("Failed parseCigar", ex_cs_gapped,
- gen_sgapped.getCigarstring());
-
- testSeqRecovery(gen_sgapped, s_gapped);
/*
* Test dataset resolution
<subpackage name="datamodel">
<disallow pkg="jalview.gui"/>
<allow pkg="fr.orsay.lri.varna"/>
+ <allow pkg="htsjdk"/>
<subpackage name="xdb.embl">
<allow pkg="org.exolab.castor"/>
</subpackage>
<allow pkg="uk.ac.vamsas"/>
<allow pkg="fr.orsay.lri.varna"/>
<allow pkg="MCview"/>
+ <allow pkg="htsjdk.samtools"/>
</subpackage>
<subpackage name="javascript">