Clean up the code
[proteocache.git] / datadb / compbio / cassandra / JpredParserLocalFile.java
index c37ec7a..c89fe59 100644 (file)
@@ -4,8 +4,6 @@ import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStreamReader;
-import java.net.HttpURLConnection;
-import java.net.MalformedURLException;
 import java.io.FileInputStream;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
@@ -14,36 +12,52 @@ import java.util.Calendar;
 import java.util.Date;
 import java.util.List;
 
-public class JpredParserLocalFile {
-       private CassandraCreate cc = new CassandraCreate();
+import compbio.data.sequence.FastaReader;
+import compbio.data.sequence.FastaSequence;
+
+public class JpredParserLocalFile implements JpredParser {
        private String dirprefix;
-       
-       public void setSource (String newsourceprefix) {
+
+       public void setSource(String newsourceprefix) {
                this.dirprefix = newsourceprefix;
        }
 
-       JpredParserLocalFile() {
+       public JpredParserLocalFile() {
                this.dirprefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat";
        }
-       
-       JpredParserLocalFile(String sourceurl) {
+
+       public JpredParserLocalFile(String sourceurl) {
                this.dirprefix = sourceurl;
        }
 
-       public void Parsing(String source, int nDays) {
+       public void Parsing(String source, int nDays) throws IOException {
                Calendar cal = Calendar.getInstance();
                cal.add(Calendar.DATE, -nDays);
+               List<String> alljobs = new ArrayList<String>();
+               File file = new File(source);
+               BufferedReader alljobsfile = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
+               String line;
+
+               while ((line = alljobsfile.readLine()) != null) {
+                       alljobs.add(line);
+               }
+               alljobsfile.close();
+
+               System.out.println("Inserting jobs for " + nDays + " days, " + alljobs.size() + " jobs in total");
+               final long startTime = System.currentTimeMillis();
                for (int i = 0; i < nDays; ++i) {
                        cal.add(Calendar.DATE, 1);
                        int month = cal.get(Calendar.MONTH) + 1;
                        int year = cal.get(Calendar.YEAR);
                        int day = cal.get(Calendar.DATE);
                        String date = year + "/" + month + "/" + day;
-                       ParsingForDate(source, date);
+                       ParsingForDate(alljobs, date);
                }
+               final long execTime = System.currentTimeMillis() - startTime;
+               System.out.println("Execution Time = " + execTime + " ms");
        }
 
-       private void ParsingForDate(String input, String date) {
+       private void ParsingForDate(List<String> input, String date) {
                int totalcount = 0;
                int countNoData = 0;
                int countUnclearFASTAid = 0;
@@ -53,68 +67,57 @@ public class JpredParserLocalFile {
                int countStrange = 0;
 
                System.out.println("Inserting jobs for " + date);
-               try {
-                       File file = new File(input);
-                       BufferedReader alljobs = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
-                       String line;
-
-                       while ((line = alljobs.readLine()) != null) {
-                               if (line.matches(date + "(.*)jp_[^\\s]+")) {
-                                       String[] table = line.split("\\s+");
-                                       String id = table[table.length - 1];
-                                       totalcount++;
-                                       if (!cc.CheckID(id)) {
-                                               String confilename = dirprefix + "/" + id + "/" + id + ".concise";
-                                               File confile = new File(confilename);
-                                               if (confile.exists()) {
+               for (String in : input) {
+                       if (in.matches(date + ":(.*)jp_[^\\s]+")) {
+                               String[] table = in.split("\\s+");
+                               String starttime = table[0];
+                               String id = table[table.length - 1];
+                               totalcount++;
+                               String confilename = dirprefix + "/" + id + "/" + id + ".concise";
+                               File confile = new File(confilename);
+                               if (confile.exists()) {
+                                       try {
+                                               final FastaReader fr = new FastaReader(confilename);
+                                               final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
+                                               String newprotein = "";
+                                               while (fr.hasNext()) {
+                                                       final FastaSequence fs = fr.next();
+                                                       if (fs.getId().equals("QUERY") || fs.getId().equals(id))
+                                                               newprotein = fs.getSequence().replaceAll("\n", "");
+                                                       else if (fs.getId().equals("jnetpred") || fs.getId().equals("JNETPRED")) {
+                                                               seqs.add(fs);
+                                                       }
+                                               }
+                                               if (newprotein.equals("")) {
+                                                       countUnclearFASTAid++;
+                                               } else {
+                                                       SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
+                                                       String dateInString1 = starttime.substring(0, starttime.indexOf(":"));
                                                        try {
-                                                               final FastaReader fr = new FastaReader(confilename);
-                                                               final List<FastaSequence> seqs = new ArrayList<FastaSequence>();
-                                                               String newprotein = "";
-                                                               while (fr.hasNext()) {
-                                                                       final FastaSequence fs = fr.next();
-                                                                       if (fs.getId().equals("QUERY") || fs.getId().equals(id))
-                                                                               newprotein = fs.getSequence().replaceAll("\n", "");
-                                                                       else
-                                                                               seqs.add(fs);
-                                                               }
-                                                               if (newprotein.equals("")) {
-                                                                       countUnclearFASTAid++;
-                                                               } else {
-                                                                       SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
-                                                                       String dateInString1 = table[0].substring(0, table[0].indexOf(":"));
-                                                                       long dateWork1 = 0;
-                                                                       try {
-                                                                               Date dat1 = formatter.parse(dateInString1);
-                                                                               dateWork1 = dat1.getTime();
-                                                                       } catch (ParseException e) {
-                                                                               e.printStackTrace();
-                                                                       }
-                                                                       cc.InsertData(dateWork1, table[0], table[1], table[2], id, "OK", "OK", newprotein, seqs);
-                                                                       ++countinsertions;
-                                                                       // flush every 100 insertions
-                                                                       if (0 == countinsertions % 100) {
-                                                                               cc.flushData();
-                                                                       }
-                                                               }
-                                                       } catch (IOException e) {
+                                                               Date dat = formatter.parse(dateInString1);
+                                                       } catch (ParseException e) {
                                                                e.printStackTrace();
                                                        }
-                                               } else {
-                                                       countNoData++;
+                                                       // countinsertions += cw.FormQueryTables(insertdate,
+                                                       // starttime, finishtime, ip, id, "OK", "OK",
+                                                       // newprotein, seqs);
                                                }
-                                       } else {
-                                               ++countinserted;
+                                               fr.close();
+                                       } catch (IOException e) {
+                                               e.printStackTrace();
                                        }
                                } else {
-                                       if (line.matches(date + "(.*)Sequence0/(.*)")) {
-                                               ++counAlignments;
-                                       } else {
-                                               ++countStrange;
-                                       }
+                                       countNoData++;
+                               }
+                       } else {
+                               if (in.matches(date + "(.*)Sequence0/(.*)")) {
+                                       ++counAlignments;
+                               } else {
+                                       ++countStrange;
                                }
                        }
-                       alljobs.close();
+               }
+               if (true) {
                        System.out.println("Total number of jobs = " + totalcount);
                        System.out.println("   " + countinserted + " jobs inserted already");
                        System.out.println("   " + counAlignments + " jalview jobs");
@@ -122,10 +125,7 @@ public class JpredParserLocalFile {
                        System.out.println("   " + countNoData + " jobs without *.concise.fasta file");
                        System.out.println("   " + countUnclearFASTAid + " jobs with unclear FASTA protein id in *.concise.fasta");
                        System.out.println("   " + countinsertions + " new job insertions\n");
-               } catch (MalformedURLException e) {
-                       e.printStackTrace();
-               } catch (IOException e) {
-                       e.printStackTrace();
                }
        }
+
 }