JAL-3026 srcjar files for VARNA and log4j
[jalview.git] / srcjar / fr / orsay / lri / varna / models / templates / BatchBenchmarkPrepare.java
diff --git a/srcjar/fr/orsay/lri/varna/models/templates/BatchBenchmarkPrepare.java b/srcjar/fr/orsay/lri/varna/models/templates/BatchBenchmarkPrepare.java
new file mode 100644 (file)
index 0000000..c3ab8ac
--- /dev/null
@@ -0,0 +1,98 @@
+/**
+ * File written by Raphael Champeimont
+ * UMR 7238 Genomique des Microorganismes
+ */
+package fr.orsay.lri.varna.models.templates;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.util.ArrayList;
+import java.util.List;
+
+import fr.orsay.lri.varna.factories.RNAFactory;
+
+public class BatchBenchmarkPrepare {
+
+       /**
+        * We assume given directory contains a alignemnt.fasta file,
+        * of which the first sequence is the consensus structure,
+        * and the other sequences are aligned nucleotides. 
+        * The principle is to convert it to a set of secondary structure,
+        * using the following rule:
+        * - keep the same nucleotides as in original sequence
+        * - keep base pairs where both bases of the pair are non-gaps in our sequence
+        */
+       public void benchmarkAllDir(File rootdir) throws Exception {
+               File seqdir = new File(rootdir, "sequences");
+               if (!seqdir.exists()) {
+                       seqdir.mkdir();
+               }
+               
+               File templateFile = new File(rootdir, "template.xml");
+               
+               ArrayList<String> seqnames = new ArrayList<String>();
+               ArrayList<String> sequences = new ArrayList<String>();
+               BatchBenchmark.readFASTA(new File(rootdir, "alignment.fasta"), seqnames, sequences);
+               
+               BufferedWriter outbufASS = new BufferedWriter(new FileWriter(new File(rootdir, "all_secondary_structures.fasta")));
+               
+               String consensusSecStr = sequences.get(0);
+               int[] consensusSecStrInt = RNAFactory.parseSecStr(consensusSecStr);
+               
+               List<File> templates = new ArrayList<File>();
+               for (int i=1; i<seqnames.size(); i++) {
+                       String seqname = seqnames.get(i);
+                       String sequence = sequences.get(i);
+                       String sequenceUngapped = sequence.replaceAll("[\\.-]", "");
+                       System.out.println(seqname);
+                       String ss = "";
+                       String nt = "";
+                       for (int j=0; j<sequence.length(); j++) {
+                               if (sequence.charAt(j) != '.' && sequence.charAt(j) != '-') {
+                                       if (consensusSecStr.charAt(j) == '-' || consensusSecStr.charAt(j) == '.') {
+                                               nt += sequence.charAt(j);
+                                               ss += '.';
+                                       } else {
+                                               int k = consensusSecStrInt[j];
+                                               // k is the matching base, is it aligned to a base in our sequence?
+                                               if (sequence.charAt(k) != '.' && sequence.charAt(k) != '-') {
+                                                       nt += sequence.charAt(j);
+                                                       ss += consensusSecStr.charAt(j);
+                                               } else {
+                                                       nt += sequence.charAt(j);
+                                                       ss += '.';
+                                               }
+                                       }
+                               }
+                       }
+                       
+                       if (!sequenceUngapped.equals(nt)) {
+                               System.out.println(sequenceUngapped);
+                               System.out.println(nt);
+                               throw new Error("bug");
+                       }
+                       
+                       // We now have the sequence with its secondary structure.
+                       File outfile = new File(seqdir, seqname + ".dbn");
+                       BufferedWriter outbuf = new BufferedWriter(new FileWriter(outfile));
+                       outbuf.write(">" + seqname + "\n");
+                       outbuf.write(nt + "\n");
+                       outbuf.write(ss + "\n");
+                       outbuf.close();
+                       
+                       outbufASS.write(">" + seqname + "\n");
+                       outbufASS.write(ss + "\n");
+                       
+                       templates.add(templateFile);
+               }
+               
+               outbufASS.close();
+               
+       }
+       
+       public static void main(String[] args) throws Exception {
+               new BatchBenchmarkPrepare().benchmarkAllDir(new File(new File("templates"), "RNaseP_bact_a"));
+       }
+       
+}