b147499123d0c28796b0fb5dd43446331a60c885
[proteocache.git] / datadb / compbio / cassandra / CassandraCreate.java
1 package compbio.cassandra;
2
3 import java.util.Arrays;
4 import java.util.List;
5
6 import me.prettyprint.cassandra.serializers.LongSerializer;
7 import me.prettyprint.cassandra.serializers.StringSerializer;
8 import me.prettyprint.cassandra.service.ThriftKsDef;
9 import me.prettyprint.hector.api.Cluster;
10 import me.prettyprint.hector.api.Keyspace;
11 import me.prettyprint.hector.api.beans.ColumnSlice;
12 import me.prettyprint.hector.api.ddl.ColumnFamilyDefinition;
13 import me.prettyprint.hector.api.ddl.ComparatorType;
14 import me.prettyprint.hector.api.ddl.KeyspaceDefinition;
15 import me.prettyprint.hector.api.factory.HFactory;
16 import me.prettyprint.hector.api.mutation.Mutator;
17 import me.prettyprint.hector.api.query.QueryResult;
18 import me.prettyprint.hector.api.query.SliceQuery;
19
20 public class CassandraCreate {
21         private static Keyspace ksp;
22         private static Cluster cluster;
23         private static Mutator<Long> mutatorLong;
24         private static Mutator<String> mutatorString;
25         private static Mutator<String> mutatorLog;
26         StringSerializer ss = StringSerializer.get();
27         LongSerializer ls = LongSerializer.get();
28
29         /*
30          * connect to the cluster and look weather the dababase has any data inside
31          */
32         public void Connection() {
33                 cluster = HFactory.getOrCreateCluster("Protein Cluster", "127.0.0.1:9160");
34                 KeyspaceDefinition keyspaceDef = cluster.describeKeyspace("ProteinKeyspace");
35                 /*
36                  * If keyspace does not exist, the CFs don't exist either. => create
37                  * them.
38                  */
39                 if (keyspaceDef == null) { // create column family
40                         System.out.println("ProteinKeyspace has been null");
41                         ColumnFamilyDefinition cfProtein = HFactory.createColumnFamilyDefinition("ProteinKeyspace", "ProteinRow",
42                                         ComparatorType.ASCIITYPE);
43                         ColumnFamilyDefinition cfLog = HFactory.createColumnFamilyDefinition("ProteinKeyspace", "ProteinLog", ComparatorType.ASCIITYPE);
44                         ColumnFamilyDefinition cfData = HFactory.createColumnFamilyDefinition("ProteinKeyspace", "ProteinData",
45                                         ComparatorType.ASCIITYPE);
46
47                         KeyspaceDefinition newKeyspace = HFactory.createKeyspaceDefinition("ProteinKeyspace", ThriftKsDef.DEF_STRATEGY_CLASS, 1,
48                                         Arrays.asList(cfProtein, cfLog, cfData));
49                         /*
50                          * Add the schema to the cluster. "true" as the second param means
51                          * that Hector will be blocked until all nodes see the change.
52                          */
53                         cluster.addKeyspace(newKeyspace, true);
54                         cluster.addColumnFamily(cfProtein, true);
55                         cluster.addColumnFamily(cfLog, true);
56                         cluster.addColumnFamily(cfData, true);
57                 } else {
58                         System.out.println("Data loaded");
59                 }
60                 ksp = HFactory.createKeyspace("ProteinKeyspace", cluster);
61                 System.out.println("Cassandra has been connected");
62         }
63
64         /*
65          * parsing data source and filling the database
66          */
67         public void Parsing(String source) {
68                 /*
69                  * CF ProteinRow store protein and prediction
70                  */
71                 mutatorString = HFactory.createMutator(ksp, ss);
72
73                 /*
74                  * ProteinLog stores logging info: IP, job id, start date and end date
75                  */
76                 mutatorLog = HFactory.createMutator(ksp, ss);
77
78                 /*
79                  * CF ProteinData store id and protein per data
80                  */
81                 mutatorLong = HFactory.createMutator(ksp, ls);
82
83                 if (true) {
84                 //if (source.equals("http")) {
85                         // get data from real Jpred production server
86                         System.out.println("Parsing web data source......");
87                         String datasrc = "http://www.compbio.dundee.ac.uk/www-jpred/results/usage-new/alljobs.dat";
88                         String prefix = "http://www.compbio.dundee.ac.uk/www-jpred/results";
89                         JpredParserHTTP parser = new JpredParserHTTP(prefix);
90                         parser.Parsing(datasrc, 4);
91                         flushData();
92                 }
93                 if (true) {
94                 //if (source.equals("file")) {
95                         // get irtifical data generated for the DB stress tests
96                         System.out.println("Parsing local file data source......");
97                         String datasrc = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/data.dat";
98                         String prefix = "/home/asherstnev/Projects/Java.projects/proteocache/data_stress_test/Jpreddata";
99                         JpredParserLocalFile parser = new JpredParserLocalFile(prefix);
100                         parser.Parsing(datasrc, 190);
101                         flushData();
102                 }
103         }
104
105         public void flushData() {
106                 mutatorString.execute();
107                 mutatorLong.execute();
108                 mutatorLog.execute();
109                 // System.out.println("Flush new data...");
110         }
111
112         public void Closing() {
113                 cluster.getConnectionManager().shutdown();
114                 System.out.println("Cassandra has been shut down");
115         }
116
117         /*
118          * check whether the job id exists in the DB
119          */
120         public boolean CheckID(String jobid) {
121                 SliceQuery<String, String, String> sliceQuery = HFactory.createSliceQuery(ksp, ss, ss, ss);
122                 sliceQuery.setColumnFamily("ProteinLog").setKey(jobid).setRange("", "", false, 100);
123                 QueryResult<ColumnSlice<String, String>> result = sliceQuery.execute();
124                 if (result.get().getColumns().size() > 0) {
125                         return true;
126                 }
127                 return false;
128         }
129
130         /*
131          * prepare data for insertion into the db
132          */
133         public void InsertData(long dataWork, String dataBegin, String dataEnd, String ip, String id, String statusEx, String statusFinal,
134                         String protein, List<FastaSequence> jnetpred) {
135                 mutatorLog.addInsertion(id, "ProteinLog", HFactory.createColumn("ip", ip, ss, ss))
136                                 .addInsertion(id, "ProteinLog", HFactory.createColumn("DataBegin", dataBegin, ss, ss))
137                                 .addInsertion(id, "ProteinLog", HFactory.createColumn("DataEnd", dataEnd, ss, ss))
138                                 .addInsertion(id, "ProteinLog", HFactory.createColumn("Status ex", statusEx, ss, ss))
139                                 .addInsertion(id, "ProteinLog", HFactory.createColumn("Status final", statusFinal, ss, ss))
140                                 .addInsertion(id, "ProteinLog", HFactory.createColumn("Protein", protein, ss, ss));
141                 for (int i = 0; i < jnetpred.size(); i++) {
142                         String namepred = jnetpred.get(i).getId();
143                         String pred = jnetpred.get(i).getSequence().replaceAll("\n", "");
144                         mutatorString.addInsertion(protein, "ProteinRow", HFactory.createColumn(id + ";" + namepred, pred, ss, ss));
145                 }
146                 mutatorLong.addInsertion(dataWork, "ProteinData", HFactory.createColumn(id, protein, ss, ss));
147         }
148
149         public Keyspace GetKeyspace() {
150                 return ksp;
151         }
152 }