From d71391d37943d1f6a5bf0af74996229f4d290cb0 Mon Sep 17 00:00:00 2001 From: tva Date: Fri, 5 Jul 2019 15:59:00 +0100 Subject: [PATCH] JAL-2629 uniquify all hmmer command sequence outputs --- src/jalview/hmmer/HMMAlign.java | 2 +- src/jalview/hmmer/HMMBuild.java | 3 ++- src/jalview/hmmer/HMMSearch.java | 12 ++++++--- src/jalview/hmmer/HmmerCommand.java | 50 ++++++++++++++++++++++++----------- src/jalview/hmmer/JackHMMER.java | 12 ++++++--- 5 files changed, 54 insertions(+), 25 deletions(-) diff --git a/src/jalview/hmmer/HMMAlign.java b/src/jalview/hmmer/HMMAlign.java index 13bbba4..d66ec33 100644 --- a/src/jalview/hmmer/HMMAlign.java +++ b/src/jalview/hmmer/HMMAlign.java @@ -87,7 +87,7 @@ public class HMMAlign extends HmmerCommand File alignmentFile = FileUtils.createTempFile("output", ".sto"); File resultFile = FileUtils.createTempFile("input", ".sto"); - exportStockholm(seqs, alignmentFile.getAbsoluteFile(), null, false); + exportStockholm(seqs, alignmentFile.getAbsoluteFile(), null); exportHmm(hmm, modelFile.getAbsoluteFile()); boolean ran = runCommand(modelFile, alignmentFile, resultFile); diff --git a/src/jalview/hmmer/HMMBuild.java b/src/jalview/hmmer/HMMBuild.java index 85abd50..7c14858 100644 --- a/src/jalview/hmmer/HMMBuild.java +++ b/src/jalview/hmmer/HMMBuild.java @@ -199,10 +199,11 @@ public class HMMBuild extends HmmerCommand // TODO rather than copy alignment data we should anonymize in situ - // export/File import could use anonymization hash to reinstate references // at import level ? + SequenceI[] copyArray = copy.toArray(new SequenceI[copy.size()]); Hashtable sequencesHash = stashSequences(copyArray); - exportStockholm(copyArray, alignmentFile, ac, false); + exportStockholm(copyArray, alignmentFile, ac); recoverSequences(sequencesHash, copy.toArray(new SequenceI[] {})); diff --git a/src/jalview/hmmer/HMMSearch.java b/src/jalview/hmmer/HMMSearch.java index b40b079..f714afc 100644 --- a/src/jalview/hmmer/HMMSearch.java +++ b/src/jalview/hmmer/HMMSearch.java @@ -25,6 +25,7 @@ import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.Hashtable; import java.util.List; import java.util.Scanner; @@ -44,6 +45,8 @@ public class HMMSearch extends HmmerCommand private String databaseName; + Hashtable sequencesHash; + /** * Constructor for the HMMSearchThread * @@ -277,12 +280,10 @@ public class HMMSearch extends HmmerCommand AlignmentI copy = new Alignment(al); deleteHmmSequences(copy); - SequenceI[] seqs = copy.getSequencesArray(); + sequencesHash = stashSequences(copy.getSequencesArray()); - // hmmsearch fails if duplicate sequence names in file - renameDuplicates(seqs); + exportStockholm(copy.getSequencesArray(), databaseFile, null); - exportStockholm(copy.getSequencesArray(), databaseFile, null, true); } args.add(getFilePath(hmmFile, true)); @@ -312,6 +313,9 @@ public class HMMSearch extends HmmerCommand StockholmFile file = new StockholmFile(new FileParse( inputAlignmentTemp.getAbsolutePath(), DataSourceType.FILE)); seqs = file.getSeqsAsArray(); + + recoverSequences(sequencesHash, seqs); + // look for PP cons and ref seq in alignment only annotation AlignmentAnnotation modelpos = null, ppcons = null; for (AlignmentAnnotation aa : file.getAnnotations()) diff --git a/src/jalview/hmmer/HmmerCommand.java b/src/jalview/hmmer/HmmerCommand.java index dd6adc8..6eb294f 100644 --- a/src/jalview/hmmer/HmmerCommand.java +++ b/src/jalview/hmmer/HmmerCommand.java @@ -236,7 +236,7 @@ public abstract class HmmerCommand implements Runnable * @throws IOException */ public void exportStockholm(SequenceI[] seqs, File toFile, - AnnotatedCollectionI annotated, boolean removeSS) + AnnotatedCollectionI annotated) throws IOException { if (seqs == null) @@ -280,17 +280,10 @@ public abstract class HmmerCommand implements Runnable for (SequenceI seq : newAl.getSequencesArray()) { - if (removeSS && seq.getAnnotation() != null) + if (seq.getAnnotation() != null) { for (AlignmentAnnotation ann : seq.getAnnotation()) { - // TODO investigate how to make hmmsearch and jackhmmer work with annotations - /* - if (ann.label.equals("Secondary Structure")) - { - seq.removeAlignmentAnnotation(ann); - } - */ seq.removeAlignmentAnnotation(ann); } } @@ -471,17 +464,33 @@ public abstract class HmmerCommand implements Runnable } } - void renameDuplicates(SequenceI[] seqs) + /** + * Sets the names of any duplicates within the given sequences to include their + * respective lengths. Deletes any duplicates that have the same name after this + * step + * + * @param seqs + */ + void renameDuplicates(AlignmentI al) { - // rename duplicate sequences, hmmsearch fails db contains duplicates + + SequenceI[] seqs = al.getSequencesArray(); + List wasRenamed = new ArrayList<>(); + + for (SequenceI seq : seqs) + { + wasRenamed.add(false); + } + for (int i = 0; i < seqs.length; i++) { - boolean renamed = false; for (int j = 0; j < seqs.length; j++) { - renamed = true; - if (seqs[i].getName().equals(seqs[j].getName()) && i != j) + if (seqs[i].getName().equals(seqs[j].getName()) && i != j + && !wasRenamed.get(j)) { + + wasRenamed.set(i, true); String range = "/" + seqs[j].getStart() + "-" + seqs[j].getEnd(); // setting sequence name to include range - to differentiate between // sequences of the same name. Currently have to include the range twice @@ -491,12 +500,23 @@ public abstract class HmmerCommand implements Runnable } } - if (renamed) + if (wasRenamed.get(i)) { String range = "/" + seqs[i].getStart() + "-" + seqs[i].getEnd(); seqs[i].setName(seqs[i].getName() + range + range); } } + + for (int i = 0; i < seqs.length; i++) + { + for (int j = 0; j < seqs.length; j++) + { + if (seqs[i].getName().equals(seqs[j].getName()) && i != j) + { + al.deleteSequence(j); + } + } + } } } diff --git a/src/jalview/hmmer/JackHMMER.java b/src/jalview/hmmer/JackHMMER.java index 58b1b75..e532ab7 100644 --- a/src/jalview/hmmer/JackHMMER.java +++ b/src/jalview/hmmer/JackHMMER.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; +import java.util.Hashtable; import java.util.List; import java.util.Scanner; @@ -39,6 +40,8 @@ public class JackHMMER extends HmmerCommand private String databaseName; + Hashtable sequencesHash; + /** * Constructor for the JackhmmerThread * @@ -79,7 +82,7 @@ public class JackHMMER extends HmmerCommand ".txt"); exportStockholm(new SequenceI[] { seq }, seqFile.getAbsoluteFile(), - null, true); + null); boolean ran = runCommand(searchOutputFile, hitsAlignmentFile, seqFile); @@ -251,10 +254,9 @@ public class JackHMMER extends HmmerCommand deleteHmmSequences(copy); - // jackhmmer fails if file contains duplicate sequence names - renameDuplicates(copy.getSequencesArray()); + sequencesHash = stashSequences(copy.getSequencesArray()); - exportStockholm(copy.getSequencesArray(), databaseFile, null, true); + exportStockholm(copy.getSequencesArray(), databaseFile, null); } args.add(getFilePath(seqFile, true)); @@ -282,6 +284,8 @@ public class JackHMMER extends HmmerCommand inputAlignmentTemp.getAbsolutePath(), DataSourceType.FILE)); seqs = file.getSeqsAsArray(); + recoverSequences(sequencesHash, seqs); + readTable(searchOutputFile); int seqCount = Math.min(seqs.length, seqsToReturn); -- 1.7.10.2