package compbio.statistic;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

import compbio.cassandra.CassandraNativeConnector;
import compbio.cassandra.DataBase;

/**
 * Read-side statistics queries (per-day job counts, execution-time
 * histograms, protein look-ups).
 *
 * <p>
 * The bodies that actually query the database are currently disabled: they
 * are kept as block comments inside each method and use the Hector client API.
 * Until they are ported, the look-up methods return empty (or all-zero)
 * results. NOTE(review): the generic type arguments inside the disabled code
 * follow the Hector API signatures; confirm them when re-enabling the code.
 *
 * <p>
 * The date bounds are held in static fields that are updated without
 * synchronisation.
 */
public class StatisticsProt {

    /** Connector instance; at present only the disabled query code refers to it. */
    private final CassandraNativeConnector cc = new CassandraNativeConnector();

    /** Midnight of the current day (ms since the epoch); refreshed on every date-range query. */
    private static long currentDateMillis = 0;

    /** Earliest date found in the database (ms since the epoch); 0 means "not determined yet". */
    private static long earliestDateMillis = 0;

    /**
     * Query: per-day details for the period from date1 till date2.
     *
     * @param date1
     *            first day of the period, formatted as yyyy/MM/dd
     * @param date2
     *            last day of the period, formatted as yyyy/MM/dd
     * @return one entry per day that has data, or {@code null} if a date is
     *         invalid, the period lies completely outside
     *         [earliest date, today], or date1 is after date2. While the
     *         database query is disabled the list is always empty.
     */
    public List<DataBase> readDetails(String date1, String date2) {
        if (!isThisDateValid(date1) || !isThisDateValid(date2)) {
            System.out.println("Wrong date: point 1");
            return null;
        }
        refreshDateRange();
        long[] range = clampToAvailableRange(DateParsing(date1), DateParsing(date2));
        if (range == null) {
            return null;
        }
        long dateStart = range[0];
        long dateEnd = range[1];
        System.out.println("StatisticsProt.readDetails: earlestDate = " + earliestDateMillis + ", currentDate = " + currentDateMillis);
        System.out.println("StatisticsProt.readDetails: Start date " + date1 + ": int representation = " + dateStart);
        System.out.println("StatisticsProt.readDetails: End date " + date2 + ": int representation = " + dateEnd);

        List<DataBase> query = new ArrayList<DataBase>();
        /*
        Disabled Hector implementation, kept for the port.

        Calendar start = Calendar.getInstance();
        start.setTime(new Date(dateStart));
        Calendar end = Calendar.getInstance();
        end.setTime(new Date(dateEnd));
        int day = 0;
        for (Date date = start.getTime(); !start.after(end); start.add(Calendar.DATE, 1), date = start.getTime()) {
            SliceQuery<Long, String, String> result = HFactory.createSliceQuery(cc.GetKeyspace(), LongSerializer.get(),
                    StringSerializer.get(), StringSerializer.get());
            result.setColumnFamily("ProteinData");
            result.setKey(date.getTime());
            result.setRange(null, null, false, Integer.MAX_VALUE);
            QueryResult<ColumnSlice<String, String>> columnSlice = result.execute();
            ++day;
            System.out.print("Day " + day + ": dataStart = " + date + ": ");
            if (!columnSlice.get().getColumns().isEmpty()) {
                DataBase db = new DataBase(DateFormat(date.getTime()), columnSlice.get().getColumns().size());
                query.add(db);
                System.out.println("data exist");
            } else {
                System.out.println("no data");
            }
        }
        */
        System.out.println("StatisticsProt.readDetails: total number of dates = " + query.size());
        return query;
    }

    /**
     * Query: execution-time histogram for the period from date1 till date2.
     *
     * <p>
     * Jobs are meant to be split into 5 bins by execution time in seconds
     * (&lt;= 30, 31-60, 61-120, 121-600, &gt; 600); see the disabled code.
     *
     * @param date1
     *            first day of the period, formatted as yyyy/MM/dd
     * @param date2
     *            last day of the period, formatted as yyyy/MM/dd
     * @return per-day entries followed by one final entry holding the totals
     *         per bin, or {@code null} under the same conditions as
     *         {@link #readDetails(String, String)}. While the database query
     *         is disabled the list holds only the totals entry, with all bins
     *         set to zero.
     */
    public List<DataBase> readLength(String date1, String date2) {
        if (!isThisDateValid(date1) || !isThisDateValid(date2)) {
            System.out.println("Wrong date: point 3");
            return null;
        }
        refreshDateRange();
        int nbins = 5;
        long[] range = clampToAvailableRange(DateParsing(date1), DateParsing(date2));
        if (range == null) {
            return null;
        }
        long dateStart = range[0];
        long dateEnd = range[1];
        System.out.println("StatisticsProt.readLength: earlestDate = " + earliestDateMillis + ", currentDate = " + currentDateMillis);
        System.out.println("StatisticsProt.readLength: Start date is " + date1 + ": int representation = " + dateStart);
        System.out.println("StatisticsProt.readLength: End date is " + date2 + ": int representation = " + dateEnd);

        List<DataBase> query = new ArrayList<DataBase>();
        // one counter per bin, all starting at zero
        List<Integer> totalTime = new ArrayList<Integer>(Collections.nCopies(nbins, 0));
        /*
        Disabled Hector implementation, kept for the port.

        Calendar start = Calendar.getInstance();
        start.setTime(new Date(dateStart));
        Calendar end = Calendar.getInstance();
        end.setTime(new Date(dateEnd));
        for (Date date = start.getTime(); !start.after(end); start.add(Calendar.DATE, 1), date = start.getTime()) {
            List<Integer> timeResult = new ArrayList<Integer>();
            SliceQuery<Long, String, String> result = HFactory.createSliceQuery(cc.GetKeyspace(), LongSerializer.get(),
                    StringSerializer.get(), StringSerializer.get());
            result.setColumnFamily("ProteinData");
            result.setKey(date.getTime());
            result.setRange(null, null, false, Integer.MAX_VALUE);
            QueryResult<ColumnSlice<String, String>> columnSlice = result.execute();
            List<HColumn<String, String>> col = columnSlice.get().getColumns();
            if (!col.isEmpty()) {
                Iterator<HColumn<String, String>> itCol = col.iterator();
                for (int i = 0; i < nbins; i++)
                    timeResult.add(i, 0);
                // split all jobs into nbins bins
                while (itCol.hasNext()) {
                    String id = itCol.next().getName();
                    long lenResult = CountID(id);
                    if (lenResult <= 30)
                        timeResult.set(0, timeResult.get(0) + 1);
                    else if (lenResult > 30 && lenResult <= 60)
                        timeResult.set(1, timeResult.get(1) + 1);
                    else if (lenResult > 60 && lenResult <= 120)
                        timeResult.set(2, timeResult.get(2) + 1);
                    else if (lenResult > 120 && lenResult <= 600)
                        timeResult.set(3, timeResult.get(3) + 1);
                    else {
                        timeResult.set(4, timeResult.get(4) + 1);
                    }
                }
                for (int i = 0; i < nbins; i++)
                    totalTime.set(i, totalTime.get(i) + timeResult.get(i));
                DataBase db = new DataBase();
                db.setTimeRez(timeResult);
                db.setDate(DateFormat(date.getTime()));
                query.add(db);
            }
        }
        */
        DataBase db = new DataBase();
        db.setTimeTotalExec(totalTime);
        query.add(db);
        System.out.println("StatisticsProt.readLength: total number of dates = " + query.size());
        return query;
    }

    /**
     * Query: predictions stored for one protein sequence.
     *
     * @param protIn
     *            the protein sequence used as the row key
     * @return the matching entries; always an empty list while the database
     *         query is disabled
     */
    public List<DataBase> readProteins(String protIn) {
        List<DataBase> query = new ArrayList<DataBase>();
        /*
        Disabled Hector implementation, kept for the port.

        SliceQuery<String, String, String> result = HFactory.createSliceQuery(cc.GetKeyspace(), StringSerializer.get(),
                StringSerializer.get(), StringSerializer.get());
        result.setColumnFamily("ProteinRow");
        result.setKey(protIn);
        result.setRange(null, null, false, Integer.MAX_VALUE);
        QueryResult<ColumnSlice<String, String>> columnSlice = result.execute();
        Iterator<HColumn<String, String>> it = columnSlice.get().getColumns().iterator();
        while (it.hasNext()) {
            HColumn<String, String> col = it.next();
            String name = col.getName();
            if (name.matches("(.*)jnetpred")) {
                DataBase db = new DataBase();
                db.setProt(protIn);
                db.setId(col.getName());
                db.setJpred(col.getValue());
                query.add(db);
            }
        }
        */
        return query;
    }

    /**
     * Query: protein sequences that have more than {@code counter} stored
     * columns.
     *
     * @param counter
     *            threshold; only rows with strictly more columns are reported
     * @return the matching entries; always an empty list while the database
     *         query is disabled
     */
    public List<DataBase> readProtID(int counter) {
        List<DataBase> query = new ArrayList<DataBase>();
        /*
        Disabled Hector implementation, kept for the port.

        int row_count = 100;
        RangeSlicesQuery<String, String, String> result = HFactory.createRangeSlicesQuery(cc.GetKeyspace(),
                StringSerializer.get(), StringSerializer.get(), StringSerializer.get());
        result.setColumnFamily("ProteinRow");
        result.setRange(null, null, false, 100);
        result.setRowCount(row_count);
        String last_key = null;
        while (true) {
            result.setKeys(last_key, null);
            QueryResult<OrderedRows<String, String, String>> columnSlice = result.execute();
            OrderedRows<String, String, String> rows = columnSlice.get();
            Iterator<Row<String, String, String>> rowsIterator = rows.iterator();
            while (rowsIterator.hasNext()) {
                Row<String, String, String> row = rowsIterator.next();
                last_key = row.getKey();
                List<HColumn<String, String>> clms = row.getColumnSlice().getColumns();
                //int npred = 0;
                //for (HColumn<String, String> cln : clms) {
                //    String name = cln.getName();
                //    if (name.matches("(.*)jnetpred")) {
                //        ++npred;
                //    }
                //}
                int npred = clms.size();
                if (npred > counter) {
                    DataBase db = new DataBase();
                    db.setProt(last_key);
                    db.setTotalId(npred);
                    query.add(db);
                }
            }
            if (rows.getCount() < row_count)
                break;
        }
        */
        return query;
    }

    /**
     * Query: proteins whose sequence contains the given fragment.
     *
     * @param protIn
     *            the sequence fragment to look for
     * @return the matching entries; always an empty list while the database
     *         query is disabled
     */
    public List<DataBase> readPart(String protIn) {
        List<DataBase> query = new ArrayList<DataBase>();
        /*
        Disabled Hector implementation, kept for the port.

        int row_count = 10000;
        RangeSlicesQuery<String, String, String> result = HFactory.createRangeSlicesQuery(cc.GetKeyspace(),
                StringSerializer.get(), StringSerializer.get(), StringSerializer.get());
        result.setColumnFamily("ProteinRow");
        result.setRange(null, null, false, Integer.MAX_VALUE);
        result.setRowCount(row_count);
        String last_key = null;
        while (true) {
            result.setKeys(last_key, null);
            QueryResult<OrderedRows<String, String, String>> columnSlice = result.execute();
            OrderedRows<String, String, String> rows = columnSlice.get();
            Iterator<Row<String, String, String>> rowsIterator = rows.iterator();
            while (rowsIterator.hasNext()) {
                Row<String, String, String> row = rowsIterator.next();
                last_key = row.getKey();
                if (last_key.matches("(.*)" + protIn + "(.*)")) {
                    Iterator<HColumn<String, String>> it = row.getColumnSlice().getColumns().iterator();
                    while (it.hasNext()) {
                        HColumn<String, String> col = it.next();
                        List<String> subProt = new ArrayList<String>();
                        String subStr = last_key;
                        while (subStr.length() > 0 && subStr.contains(protIn)) {
                            String first = subStr.substring(0, subStr.indexOf(protIn));
                            if (first.length() > 0)
                                subProt.add(first);
                            subProt.add(protIn);
                            subStr = subStr.substring(subStr.indexOf(protIn) + protIn.length(), subStr.length());
                        }
                        if (subStr.length() > 0)
                            subProt.add(subStr);
                        String name = col.getName();
                        if (name.matches("(.*)jnetpred")) {
                            DataBase db = new DataBase();
                            db.setProt(last_key);
                            db.setId(col.getName());
                            db.setJpred(col.getValue());
                            db.setSubProt(subProt);
                            query.add(db);
                        }
                    }
                }
            }
            if (rows.getCount() < row_count)
                break;
        }
        */
        return query;
    }

    /**
     * Restricts a requested period to [earliest date in the database, today].
     *
     * @return {@code {start, end}} after clamping, or {@code null} when the
     *         period lies completely before the earliest date, completely
     *         after today, or start is after end
     */
    private static long[] clampToAvailableRange(long dateStart, long dateEnd) {
        boolean beforeAnyData = dateStart < earliestDateMillis && dateEnd < earliestDateMillis;
        boolean afterToday = dateStart > currentDateMillis && dateEnd > currentDateMillis;
        if (beforeAnyData || afterToday || dateStart > dateEnd) {
            return null;
        }
        // The original code assigned currentDate to dateStart here, leaving
        // the end of the period unclamped; the end is what has to be capped.
        return new long[] { Math.max(dateStart, earliestDateMillis), Math.min(dateEnd, currentDateMillis) };
    }

    /**
     * Converts a yyyy/MM/dd date string into milliseconds since the epoch.
     *
     * @return the parsed time, or 0 if the input is {@code null} or cannot be
     *         parsed (the parse error is printed)
     */
    private static long DateParsing(String datInput) {
        if (datInput == null) {
            return 0;
        }
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
        try {
            return formatter.parse(datInput).getTime();
        } catch (ParseException e) {
            e.printStackTrace();
            return 0;
        }
    }

    /**
     * Converts a yyyy/MM/dd:HH:mm:ss date-time string into milliseconds since
     * the epoch. Only the disabled code in {@link #CountID(String)} uses it.
     *
     * @return the parsed time, or 0 if the input is {@code null} or cannot be
     *         parsed (the parse error is printed)
     */
    private static long TimeConvert(String datInput) {
        if (datInput == null) {
            return 0;
        }
        // HH (0-23), not hh: the pattern has no am/pm marker, and with hh an
        // hour value of 12 would be read as midnight.
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd:HH:mm:ss");
        try {
            return formatter.parse(datInput).getTime();
        } catch (ParseException e) {
            e.printStackTrace();
            return 0;
        }
    }

    /**
     * Formats milliseconds since the epoch as dd/MM/yyyy. Only the disabled
     * query code uses it.
     */
    private static String DateFormat(long inDate) {
        return new SimpleDateFormat("dd/MM/yyyy").format(new Date(inDate));
    }

    /**
     * Formats milliseconds since the epoch as yyyy/MM/dd, i.e. the format
     * accepted by the date-range queries of this class.
     */
    public static String DateFormatYYMMDD(long indate) {
        return new SimpleDateFormat("yyyy/MM/dd").format(new Date(indate));
    }

    /**
     * Execution time of a job in seconds.
     *
     * @param id
     *            the job identifier
     * @return always 0 while the database query is disabled; the disabled
     *         code computes (DataEnd - DataBegin) / 1000 from the job's log
     *         record
     */
    public long CountID(String id) {
        /*
        Disabled Hector implementation, kept for the port.

        SliceQuery<String, String, String> sliceQuery = HFactory.createSliceQuery(cc.GetKeyspace(), StringSerializer.get(),
                StringSerializer.get(), StringSerializer.get());
        sliceQuery.setColumnFamily("ProteinLog").setKey(id).setRange("", "", false, 100);
        QueryResult<ColumnSlice<String, String>> result = sliceQuery.execute();
        String datBegin = result.get().getColumnByName("DataBegin").getValue();
        String datEnd = result.get().getColumnByName("DataEnd").getValue();
        long datBeginLong = TimeConvert(datBegin);
        long datEndLong = TimeConvert(datEnd);
        return (datEndLong - datBeginLong) / 1000;
        */
        return 0;
    }

    /**
     * Sets the earliest and the current date.
     *
     * <p>
     * The earliest date is static and is looked up only while it is still
     * undetermined (0); the current date is recalculated on every call as
     * midnight of today.
     */
    private void refreshDateRange() {
        if (0 == earliestDateMillis) {
            // Ask this instance rather than building a second StatisticsProt
            // (and with it a second connector) just for the look-up.
            earliestDateMillis = earliestDate();
            System.out.println("Set earlest Date = " + earliestDateMillis);
        }
        // Formatting "now" as yyyy/MM/dd and parsing it back drops the time of day.
        currentDateMillis = DateParsing(DateFormatYYMMDD(System.currentTimeMillis()));
    }

    /**
     * Checks that a string is a real calendar date in yyyy/MM/dd format.
     *
     * @param dateToValidate
     *            the string to check
     * @return {@code false} for {@code null}, an empty string, or a string
     *         that does not parse strictly; {@code true} otherwise
     */
    public boolean isThisDateValid(String dateToValidate) {
        if (dateToValidate == null || dateToValidate.equals("")) {
            System.out.println("Undefined date");
            return false;
        }
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd");
        // non-lenient, so that impossible dates such as 2013/02/30 are rejected
        sdf.setLenient(false);
        try {
            sdf.parse(dateToValidate);
        } catch (ParseException e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Finds the earliest date in the database.
     *
     * @return the earliest date in milliseconds since the epoch, or 0 when no
     *         dates are available (always the case while the database query
     *         is disabled)
     */
    public long earliestDate() {
        List<Long> dateSort = new ArrayList<Long>();
        /*
        Disabled Hector implementation, kept for the port.

        int row_count = 10000;
        RangeSlicesQuery<Long, String, String> result = HFactory.createRangeSlicesQuery(cc.GetKeyspace(),
                LongSerializer.get(), StringSerializer.get(), StringSerializer.get());
        result.setColumnFamily("ProteinData");
        result.setRange(null, null, false, Integer.MAX_VALUE);
        result.setRowCount(row_count);
        Long last_key = null;
        while (true) {
            result.setKeys(last_key, null);
            QueryResult<OrderedRows<Long, String, String>> columnSlice = result.execute();
            OrderedRows<Long, String, String> rows = columnSlice.get();
            Iterator<Row<Long, String, String>> rowsIterator = rows.iterator();
            while (rowsIterator.hasNext()) {
                Row<Long, String, String> row = rowsIterator.next();
                last_key = row.getKey();
                dateSort.add(last_key);
            }
            if (rows.getCount() < row_count)
                break;
        }
        */
        if (dateSort.isEmpty()) {
            // nothing to pick from; get(0) on the empty list would throw
            return 0;
        }
        return Collections.min(dateSort);
    }
}