2 * File written by Raphael Champeimont
3 * UMR 7238 Genomique des Microorganismes
5 package fr.orsay.lri.varna.models.templates;
7 import java.io.BufferedWriter;
9 import java.io.FileWriter;
10 import java.util.ArrayList;
11 import java.util.List;
13 import fr.orsay.lri.varna.factories.RNAFactory;
15 public class BatchBenchmarkPrepare {
18 * We assume the given directory contains an alignment.fasta file,
19 * of which the first sequence is the consensus structure,
20 * and the other sequences are aligned nucleotides.
21 * The principle is to convert it to a set of secondary structures,
22 * using the following rule:
23 * - keep the same nucleotides as in original sequence
24 * - keep base pairs where both bases of the pair are non-gaps in our sequence
// Converts the alignment stored under rootdir into one secondary-structure
// file per aligned sequence.
//
// Files touched (all relative to rootdir):
//   alignment.fasta                 - input, read through BatchBenchmark.readFASTA
//   sequences/<seqname>.dbn         - output, one per sequence after the first
//   all_secondary_structures.fasta  - output, one ">name" + structure entry per sequence
//   template.xml                    - only referenced; added to a list once per sequence
//
// @param rootdir directory holding alignment.fasta
// @throws Exception declared broadly; the visible code also throws Error("bug")
//         when the rebuilt nucleotide string differs from the ungapped sequence
//
// NOTE(review): some statements of this method are not visible in this copy
// (the declarations of nt and ss, the else branches, the closing braces and any
// close() calls on the writers). The comments below describe only what the
// visible lines do; confirm the rest against the complete file.
26 public void benchmarkAllDir(File rootdir) throws Exception {
27 File seqdir = new File(rootdir, "sequences");
// Branch taken when the "sequences" output directory does not exist yet.
// NOTE(review): the body of this branch is not visible here - presumably it
// creates the directory; TODO confirm.
28 if (!seqdir.exists()) {
32 File templateFile = new File(rootdir, "template.xml");
// Parallel lists handed to readFASTA: presumably seqnames.get(i) is the header
// of sequences.get(i) - verify against BatchBenchmark.readFASTA.
34 ArrayList<String> seqnames = new ArrayList<String>();
35 ArrayList<String> sequences = new ArrayList<String>();
36 BatchBenchmark.readFASTA(new File(rootdir, "alignment.fasta"), seqnames, sequences);
// Aggregate output: receives a header line and a structure line per sequence.
38 BufferedWriter outbufASS = new BufferedWriter(new FileWriter(new File(rootdir, "all_secondary_structures.fasta")));
// Entry 0 of the alignment is used as the consensus secondary structure,
// not as a nucleotide sequence.
40 String consensusSecStr = sequences.get(0);
// Parsed form of the consensus; indexed below by alignment column to obtain
// the column of the paired base (see the use of k).
41 int[] consensusSecStrInt = RNAFactory.parseSecStr(consensusSecStr);
43 List<File> templates = new ArrayList<File>();
// Start at 1: entry 0 is the consensus structure handled above.
44 for (int i=1; i<seqnames.size(); i++) {
45 String seqname = seqnames.get(i);
46 String sequence = sequences.get(i);
// Same sequence with every '.' and '-' (gap characters) removed; used further
// down as the expected value of nt.
47 String sequenceUngapped = sequence.replaceAll("[\\.-]", "");
48 System.out.println(seqname);
// Walk the alignment columns; nt collects nucleotides and ss collects
// structure characters (their declarations are not visible in this copy).
51 for (int j=0; j<sequence.length(); j++) {
// Only columns where this sequence has a nucleotide (not a gap) contribute.
52 if (sequence.charAt(j) != '.' && sequence.charAt(j) != '-') {
// Consensus shows '-' or '.' at this column: the nucleotide is kept.
// NOTE(review): the matching update of ss for this case is not visible here.
53 if (consensusSecStr.charAt(j) == '-' || consensusSecStr.charAt(j) == '.') {
54 nt += sequence.charAt(j);
// Remaining case: the consensus has some other character at column j, and k is
// looked up as its partner column.
57 int k = consensusSecStrInt[j];
58 // k is the matching base, is it aligned to a base in our sequence?
59 if (sequence.charAt(k) != '.' && sequence.charAt(k) != '-') {
// Partner column is a nucleotide too: keep the base and copy the consensus
// structure character for this column.
60 nt += sequence.charAt(j);
61 ss += consensusSecStr.charAt(j);
// NOTE(review): presumably the else branch (partner column is a gap) - the
// nucleotide is still kept; the ss update for this case is not visible here.
63 nt += sequence.charAt(j);
// Sanity check: the nucleotides collected column by column must equal the
// ungapped sequence; otherwise both strings are printed and an Error is thrown.
70 if (!sequenceUngapped.equals(nt)) {
71 System.out.println(sequenceUngapped);
72 System.out.println(nt);
73 throw new Error("bug");
76 // We now have the sequence with its secondary structure.
// Per-sequence output: three lines - ">" + name, nucleotides, structure.
77 File outfile = new File(seqdir, seqname + ".dbn");
78 BufferedWriter outbuf = new BufferedWriter(new FileWriter(outfile));
79 outbuf.write(">" + seqname + "\n");
80 outbuf.write(nt + "\n");
81 outbuf.write(ss + "\n");
// NOTE(review): no close()/flush of outbuf is visible in this copy - confirm
// the writer is closed in the complete file, otherwise buffered data may be lost.
// Aggregate output: header and structure only (the nucleotides are not written).
84 outbufASS.write(">" + seqname + "\n");
85 outbufASS.write(ss + "\n");
// The same template file is recorded once for every processed sequence.
87 templates.add(templateFile);
// Command-line entry point: runs benchmarkAllDir on the hard-coded relative
// directory templates/RNaseP_bact_a. The args parameter is not used by the
// visible code.
94 public static void main(String[] args) throws Exception {
95 new BatchBenchmarkPrepare().benchmarkAllDir(new File(new File("templates"), "RNaseP_bact_a"));