package compbio.statistic;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

import me.prettyprint.cassandra.serializers.LongSerializer;
import me.prettyprint.cassandra.serializers.StringSerializer;
import me.prettyprint.hector.api.beans.ColumnSlice;
import me.prettyprint.hector.api.beans.HColumn;
import me.prettyprint.hector.api.beans.OrderedRows;
import me.prettyprint.hector.api.beans.Row;
import me.prettyprint.hector.api.factory.HFactory;
import me.prettyprint.hector.api.query.QueryResult;
import me.prettyprint.hector.api.query.RangeSlicesQuery;
import me.prettyprint.hector.api.query.SliceQuery;

import compbio.cassandra.CassandraCreate;
import compbio.cassandra.DataBase;

/**
 * Statistics queries over the Cassandra column families {@code ProteinData},
 * {@code ProteinRow} and {@code ProteinLog}, issued through the Hector client.
 *
 * <ul>
 * <li>{@code ProteinData} is read with {@code long} row keys (one key per day,
 * stepped in {@link #MILLISECONDS_PER_DAY} increments); its column names are
 * passed to {@link #CountID(String)} as job ids.</li>
 * <li>{@code ProteinRow} is read with the protein sequence as row key; columns
 * whose name ends in {@code jnetpred} carry a prediction value.</li>
 * <li>{@code ProteinLog} is read with the job id as row key and provides the
 * {@code DataBegin} / {@code DataEnd} columns.</li>
 * </ul>
 *
 * The date window ({@code earlestDate}, {@code currentDate}) and the result
 * list are kept in unsynchronised fields.
 */
public class StatisticsProt {
    private final static long MILLISECONDS_PER_DAY = 1000L * 60 * 60 * 24;

    /** Page size used when scanning a whole column family. */
    private static final int ROWS_PER_PAGE = 10000;

    /** Number of execution-time buckets reported by {@link #readLength}. */
    private static final int TIME_BUCKETS = 4;

    /** Column-name suffix that marks a prediction column in {@code ProteinRow}. */
    private static final String PREDICTION_SUFFIX = "jnetpred";

    private CassandraCreate cc = new CassandraCreate();
    private ArrayList<DataBase> query;
    private static long currentDate = 0;
    private static long earlestDate = 0;

    /**
     * Query: number of jobs per day for the period from date1 till date2.
     *
     * @param date1 first day of the period, formatted {@code yyyy/MM/dd}
     * @param date2 last day of the period, formatted {@code yyyy/MM/dd}
     * @return one {@link DataBase} per day that has at least one column in
     *         {@code ProteinData}, or {@code null} when either date is invalid
     */
    public List<DataBase> readDetails(String date1, String date2) {
        if (!isThisDateValid(date1) || !isThisDateValid(date2)) {
            System.out.println("Wrong date: point 1");
            return null;
        }
        SetDateRange();
        long dateStart = clampStart(DateParsing(date1));
        long dateEnd = clampEnd(DateParsing(date2));
        System.out.println("StatisticsProt.readDetails: earlestDate = " + earlestDate + ", currentDate = " + currentDate);
        System.out.println("StatisticsProt.readDetails: Start date " + date1 + ": int representation = " + dateStart);
        System.out.println("StatisticsProt.readDetails: End date " + date2 + ": int representation = " + dateEnd);

        query = new ArrayList<DataBase>();
        int day = 0;
        while (dateStart <= dateEnd) {
            List<HColumn<String, String>> columns = fetchDay(dateStart);
            ++day;
            System.out.print("Day " + day + ": dataStart = " + dateStart + ": ");
            if (!columns.isEmpty()) {
                query.add(new DataBase(DateFormat(dateStart), columns.size()));
                System.out.println("data exist");
            } else {
                System.out.println("no data");
            }
            dateStart += MILLISECONDS_PER_DAY;
        }
        System.out.println("StatisticsProt.readDetails: total number of dates = " + query.size());
        return query;
    }

    /**
     * Query: execution time for the period from date1 till date2.
     *
     * For every day with data the jobs are counted in four buckets by their
     * duration in seconds as returned by {@link #CountID(String)}:
     * {@code <= 30}, {@code <= 60}, {@code <= 120} and everything longer.
     *
     * @param date1 first day of the period, formatted {@code yyyy/MM/dd}
     * @param date2 last day of the period, formatted {@code yyyy/MM/dd}
     * @return one {@link DataBase} per day with data, followed by a final
     *         element that carries the bucket totals over the whole period;
     *         {@code null} when either date is invalid
     */
    public List<DataBase> readLength(String date1, String date2) {
        if (!isThisDateValid(date1) || !isThisDateValid(date2)) {
            System.out.println("Wrong date: point 3");
            return null;
        }
        SetDateRange();
        long dateStart = clampStart(DateParsing(date1));
        long dateEnd = clampEnd(DateParsing(date2));
        System.out.println("StatisticsProt.readLength: earlestDate = " + earlestDate + ", currentDate = " + currentDate);
        System.out.println("StatisticsProt.readLength: Start date is " + date1 + ": int representation = " + dateStart);
        System.out.println("StatisticsProt.readLength: End date is " + date2 + ": int representation = " + dateEnd);

        query = new ArrayList<DataBase>();
        List<Integer> totalTime = new ArrayList<Integer>(Collections.nCopies(TIME_BUCKETS, 0));
        while (dateStart <= dateEnd) {
            List<HColumn<String, String>> columns = fetchDay(dateStart);
            if (!columns.isEmpty()) {
                List<Integer> timeResult = new ArrayList<Integer>(Collections.nCopies(TIME_BUCKETS, 0));
                for (HColumn<String, String> column : columns) {
                    // The column name is the job id used as row key in ProteinLog.
                    int bucket = bucketFor(CountID(column.getName()));
                    timeResult.set(bucket, timeResult.get(bucket) + 1);
                }
                for (int i = 0; i < TIME_BUCKETS; i++) {
                    totalTime.set(i, totalTime.get(i) + timeResult.get(i));
                }
                DataBase db = new DataBase();
                db.setTimeRez(timeResult);
                db.setDate(DateFormat(dateStart));
                query.add(db);
            }
            dateStart += MILLISECONDS_PER_DAY;
        }
        // The last element holds the totals, not a single day.
        DataBase totals = new DataBase();
        totals.setTimeTotalExec(totalTime);
        query.add(totals);
        System.out.println("StatisticsProt.readLength: total number of dates = " + query.size());
        return query;
    }

    /**
     * Query: all predictions stored for one protein sequence.
     *
     * @param protIn the sequence, used verbatim as the {@code ProteinRow} key
     * @return one {@link DataBase} per column whose name ends in
     *         {@code jnetpred}; empty when the row has no such column
     */
    public List<DataBase> readProteins(String protIn) {
        query = new ArrayList<DataBase>();
        SliceQuery<String, String, String> result = HFactory.createSliceQuery(cc.GetKeyspace(), StringSerializer.get(),
                StringSerializer.get(), StringSerializer.get());
        result.setColumnFamily("ProteinRow");
        result.setKey(protIn);
        result.setRange(null, null, false, Integer.MAX_VALUE);
        QueryResult<ColumnSlice<String, String>> columnSlice = result.execute();
        for (HColumn<String, String> col : columnSlice.get().getColumns()) {
            if (isPrediction(col.getName())) {
                DataBase db = new DataBase();
                db.setProt(protIn);
                db.setId(col.getName());
                db.setJpred(col.getValue());
                query.add(db);
            }
        }
        return query;
    }

    /**
     * Query: sequences that have at least {@code counter} prediction columns.
     *
     * @param counter minimum number of columns ending in {@code jnetpred}
     * @return one {@link DataBase} per qualifying row of {@code ProteinRow},
     *         carrying the sequence and its prediction count
     */
    public List<DataBase> readProtID(int counter) {
        query = new ArrayList<DataBase>();
        int row_count = 100000000;
        RangeSlicesQuery<String, String, String> result = HFactory.createRangeSlicesQuery(cc.GetKeyspace(),
                StringSerializer.get(), StringSerializer.get(), StringSerializer.get());
        result.setColumnFamily("ProteinRow");
        result.setRange(null, null, false, Integer.MAX_VALUE);
        result.setRowCount(row_count);
        String last_key = null;
        while (true) {
            result.setKeys(last_key, null);
            QueryResult<OrderedRows<String, String, String>> columnSlice = result.execute();
            OrderedRows<String, String, String> rows = columnSlice.get();
            String pageStart = last_key;
            for (Row<String, String, String> row : rows) {
                last_key = row.getKey();
                // The start key of a page is inclusive, so the first row of a
                // follow-up page repeats the last row of the previous one.
                if (pageStart != null && pageStart.equals(last_key)) {
                    continue;
                }
                int npred = 0;
                for (HColumn<String, String> cln : row.getColumnSlice().getColumns()) {
                    if (isPrediction(cln.getName())) {
                        ++npred;
                    }
                }
                if (npred >= counter) {
                    DataBase db = new DataBase();
                    db.setProt(last_key);
                    db.setTotalId(npred);
                    query.add(db);
                }
            }
            if (rows.getCount() < row_count) {
                break;
            }
        }
        return query;
    }

    /**
     * Query: predictions for every sequence that contains a given fragment.
     *
     * The fragment is matched literally. Each result also carries the sequence
     * split into the pieces before, at and after every occurrence of the
     * fragment (see {@link #splitAround(String, String)}).
     *
     * @param protIn the fragment to look for; {@code null} or empty yields an
     *               empty result
     * @return one {@link DataBase} per prediction column of every matching row
     */
    public List<DataBase> readPart(String protIn) {
        query = new ArrayList<DataBase>();
        if (protIn == null || protIn.length() == 0) {
            // An empty fragment matches everywhere and can never be consumed
            // by the splitting loop.
            return query;
        }
        RangeSlicesQuery<String, String, String> result = HFactory.createRangeSlicesQuery(cc.GetKeyspace(),
                StringSerializer.get(), StringSerializer.get(), StringSerializer.get());
        result.setColumnFamily("ProteinRow");
        result.setRange(null, null, false, Integer.MAX_VALUE);
        result.setRowCount(ROWS_PER_PAGE);
        String last_key = null;
        while (true) {
            result.setKeys(last_key, null);
            QueryResult<OrderedRows<String, String, String>> columnSlice = result.execute();
            OrderedRows<String, String, String> rows = columnSlice.get();
            String pageStart = last_key;
            for (Row<String, String, String> row : rows) {
                last_key = row.getKey();
                // Skip the boundary row that was already handled on the previous page.
                if (pageStart != null && pageStart.equals(last_key)) {
                    continue;
                }
                if (!last_key.contains(protIn)) {
                    continue;
                }
                for (HColumn<String, String> col : row.getColumnSlice().getColumns()) {
                    if (isPrediction(col.getName())) {
                        DataBase db = new DataBase();
                        db.setProt(last_key);
                        db.setId(col.getName());
                        db.setJpred(col.getValue());
                        // A fresh list per result, as each DataBase owns its pieces.
                        db.setSubProt(splitAround(last_key, protIn));
                        query.add(db);
                    }
                }
            }
            if (rows.getCount() < ROWS_PER_PAGE) {
                break;
            }
        }
        return query;
    }

    /**
     * Converts a {@code yyyy/MM/dd} string to epoch milliseconds.
     *
     * @return the parsed time, or 0 when the input is {@code null} or cannot
     *         be parsed (the parse error is printed)
     */
    private static long DateParsing(String datInput) {
        if (datInput == null) {
            return 0;
        }
        long dateWorkSt = 0;
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd");
        try {
            dateWorkSt = formatter.parse(datInput).getTime();
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return dateWorkSt;
    }

    /**
     * Converts a {@code yyyy/MM/dd:hh:mm:ss} string to epoch milliseconds.
     *
     * NOTE(review): the pattern uses {@code hh} (12-hour field). If the stored
     * timestamps use a 24-hour clock, {@code HH} is probably intended --
     * confirm against whatever writes {@code ProteinLog}.
     *
     * @return the parsed time, or 0 when the input is {@code null} or cannot
     *         be parsed (the parse error is printed)
     */
    private static long TimeConvert(String datInput) {
        long dateWorkSt = 0;
        if (datInput == null) {
            return dateWorkSt;
        }
        SimpleDateFormat formatter = new SimpleDateFormat("yyyy/MM/dd:hh:mm:ss");
        try {
            dateWorkSt = formatter.parse(datInput).getTime();
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return dateWorkSt;
    }

    /** Formats epoch milliseconds as {@code dd/MM/yyyy}. */
    private static String DateFormat(long inDate) {
        SimpleDateFormat datformat = new SimpleDateFormat("dd/MM/yyyy");
        return datformat.format(new Date(inDate));
    }

    /** Formats epoch milliseconds as {@code yyyy/MM/dd}. */
    public static String DateFormatYYMMDD(long indate) {
        SimpleDateFormat datformat = new SimpleDateFormat("yyyy/MM/dd");
        return datformat.format(new Date(indate));
    }

    /**
     * Returns the duration of one job in whole seconds, computed from the
     * {@code DataBegin} and {@code DataEnd} columns of its {@code ProteinLog}
     * row.
     *
     * NOTE(review): a row that lacks either column makes
     * {@code getColumnByName} return {@code null} and this method throw a
     * {@link NullPointerException} -- confirm every logged job has both.
     *
     * @param id the job id, used as the {@code ProteinLog} row key
     */
    public long CountID(String id) {
        SliceQuery<String, String, String> sliceQuery = HFactory.createSliceQuery(cc.GetKeyspace(),
                StringSerializer.get(), StringSerializer.get(), StringSerializer.get());
        sliceQuery.setColumnFamily("ProteinLog").setKey(id).setRange("", "", false, 100);
        QueryResult<ColumnSlice<String, String>> result = sliceQuery.execute();
        String datBegin = result.get().getColumnByName("DataBegin").getValue();
        String datEnd = result.get().getColumnByName("DataEnd").getValue();
        long datBeginLong = TimeConvert(datBegin);
        long datEndLong = TimeConvert(datEnd);
        return (datEndLong - datBeginLong) / 1000;
    }

    /**
     * Refreshes the static date window: looks up the earliest stored day once
     * (0 means "not looked up yet") and sets the current day to today's date
     * as parsed by {@link #DateParsing(String)}.
     */
    private static void SetDateRange() {
        if (0 == earlestDate) {
            StatisticsProt sp = new StatisticsProt();
            earlestDate = sp.earliestDate();
            System.out.println("Set earlest Date = " + earlestDate);
        }
        Calendar cal = Calendar.getInstance();
        currentDate = DateParsing(cal.get(Calendar.YEAR) + "/" + (cal.get(Calendar.MONTH) + 1) + "/"
                + cal.get(Calendar.DAY_OF_MONTH));
    }

    /**
     * Checks that a string is a real calendar date in {@code yyyy/MM/dd} form.
     *
     * @return {@code false} for {@code null}, the empty string, or anything
     *         the non-lenient parser rejects (the parse error is printed)
     */
    public boolean isThisDateValid(String dateToValidate) {
        if (dateToValidate == null || dateToValidate.equals("")) {
            System.out.println("Undefined date");
            return false;
        }
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd");
        // Non-lenient, so out-of-range fields such as month 13 are rejected.
        sdf.setLenient(false);
        try {
            sdf.parse(dateToValidate);
        } catch (ParseException e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Finds the earliest date, i.e. the smallest row key of
     * {@code ProteinData}.
     *
     * @return the smallest key, or 0 when the column family has no rows
     */
    public long earliestDate() {
        RangeSlicesQuery<Long, String, String> result = HFactory.createRangeSlicesQuery(cc.GetKeyspace(),
                LongSerializer.get(), StringSerializer.get(), StringSerializer.get());
        result.setColumnFamily("ProteinData");
        result.setRange(null, null, false, Integer.MAX_VALUE);
        result.setRowCount(ROWS_PER_PAGE);
        Long last_key = null;
        boolean found = false;
        long earliest = 0;
        while (true) {
            result.setKeys(last_key, null);
            QueryResult<OrderedRows<Long, String, String>> columnSlice = result.execute();
            OrderedRows<Long, String, String> rows = columnSlice.get();
            for (Row<Long, String, String> row : rows) {
                last_key = row.getKey();
                // A boundary row repeated on the next page cannot change the minimum.
                if (!found || last_key.longValue() < earliest) {
                    earliest = last_key.longValue();
                    found = true;
                }
            }
            if (rows.getCount() < ROWS_PER_PAGE) {
                break;
            }
        }
        return earliest;
    }

    /** Reads all columns of the {@code ProteinData} row for one day key. */
    private List<HColumn<String, String>> fetchDay(long dayKey) {
        SliceQuery<Long, String, String> result = HFactory.createSliceQuery(cc.GetKeyspace(), LongSerializer.get(),
                StringSerializer.get(), StringSerializer.get());
        result.setColumnFamily("ProteinData");
        result.setKey(dayKey);
        result.setRange(null, null, false, Integer.MAX_VALUE);
        QueryResult<ColumnSlice<String, String>> columnSlice = result.execute();
        return columnSlice.get().getColumns();
    }

    /** Moves a requested start date into the window of stored days. */
    private static long clampStart(long dateStart) {
        if (dateStart < earlestDate) {
            dateStart = earlestDate;
        }
        if (dateStart > currentDate) {
            dateStart = currentDate - MILLISECONDS_PER_DAY;
        }
        return dateStart;
    }

    /** Moves a requested end date into the window of stored days. */
    private static long clampEnd(long dateEnd) {
        if (dateEnd < earlestDate) {
            dateEnd = earlestDate + MILLISECONDS_PER_DAY;
        }
        if (dateEnd > currentDate) {
            dateEnd = currentDate;
        }
        return dateEnd;
    }

    /** Maps a duration in seconds to its bucket index (0..3). */
    private static int bucketFor(long seconds) {
        if (seconds <= 30) {
            return 0;
        }
        if (seconds <= 60) {
            return 1;
        }
        if (seconds <= 120) {
            return 2;
        }
        return 3;
    }

    /** Tells whether a {@code ProteinRow} column name marks a prediction. */
    private static boolean isPrediction(String columnName) {
        return columnName.endsWith(PREDICTION_SUFFIX);
    }

    /**
     * Splits a sequence into the pieces before, at and after every occurrence
     * of a non-empty fragment, in order; empty pieces are left out.
     */
    private static List<String> splitAround(String sequence, String fragment) {
        List<String> parts = new ArrayList<String>();
        String rest = sequence;
        int at = rest.indexOf(fragment);
        while (at >= 0) {
            if (at > 0) {
                parts.add(rest.substring(0, at));
            }
            parts.add(fragment);
            rest = rest.substring(at + fragment.length());
            at = rest.indexOf(fragment);
        }
        if (rest.length() > 0) {
            parts.add(rest);
        }
        return parts;
    }
}