JAL-3026 srcjar files for VARNA and log4j
[jalview.git] / srcjar / fr / orsay / lri / varna / utils / RNAMLParser.java
1 /*
2  VARNA is a tool for the automated drawing, visualization and annotation of the secondary structure of RNA, designed as a companion software for web servers and databases.
3  Copyright (C) 2008  Kevin Darty, Alain Denise and Yann Ponty.
4  electronic mail : Yann.Ponty@lri.fr
5  paper mail : LRI, bat 490 Universit� Paris-Sud 91405 Orsay Cedex France
6
7  This file is part of VARNA version 3.1.
8  VARNA version 3.1 is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License
9  as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
10
11  VARNA version 3.1 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
12  without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13  See the GNU General Public License for more details.
14
15  You should have received a copy of the GNU General Public License along with VARNA version 3.1.
16  If not, see http://www.gnu.org/licenses.
17  */
18 package fr.orsay.lri.varna.utils;
19
20 import java.awt.Point;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.InputStreamReader;
24 import java.io.StringReader;
25 import java.net.URL;
26 import java.net.URLConnection;
27 import java.util.ArrayList;
28 import java.util.Collections;
29 import java.util.Hashtable;
30 import java.util.List;
31 import java.util.Stack;
32 import java.util.Vector;
33
34 import org.xml.sax.Attributes;
35 import org.xml.sax.InputSource;
36 import org.xml.sax.SAXException;
37 import org.xml.sax.helpers.DefaultHandler;
38
39 import fr.orsay.lri.varna.models.rna.ModeleBP;
40 import fr.orsay.lri.varna.models.rna.ModeleBase;
41 import fr.orsay.lri.varna.models.rna.ModeleBP.Edge;
42 import fr.orsay.lri.varna.models.rna.ModeleBP.Stericity;
43
44 public class RNAMLParser extends DefaultHandler {
45         public class HelixTemp {
46                 public int pos5, pos3, length;
47                 public String name;
48
49                 public HelixTemp(int pos5, int pos3, int length, String name) {
50                         this.pos3 = pos3;
51                         this.pos5 = pos5;
52                         this.length = length;
53                         this.name = name;
54                 }
55
56                 public String toString() {
57                         return ("[" + name + "," + pos5 + "," + pos3 + "," + length + "]");
58                 }
59
60         }
61
62         public class BPTemp {
63                 public int pos5, pos3;
64                 public String edge5, edge3, orientation;
65
66                 public BPTemp(int pos5, int pos3, String edge5, String edge3,
67                                 String orientation) {
68                         if (edge3 == null) {
69                                 edge3 = "+";
70                         }
71                         if (edge5 == null) {
72                                 edge5 = "+";
73                         }
74                         if (orientation == null) {
75                                 orientation = "c";
76                         }
77                         this.pos5 = pos5;
78                         this.pos3 = pos3;
79                         this.edge5 = edge5;
80                         this.edge3 = edge3;
81                         this.orientation = orientation;
82                 }
83
84                 public ModeleBP createBPStyle(ModeleBase mb5, ModeleBase mb3) {
85                         ModeleBP.Edge e5, e3;
86                         @SuppressWarnings("unused")
87                         boolean isCanonical = false;
88                         if (edge5.equals("W")) {
89                                 e5 = ModeleBP.Edge.WC;
90                         } else if (edge5.equals("H")) {
91                                 e5 = ModeleBP.Edge.HOOGSTEEN;
92                         } else if (edge5.equals("S")) {
93                                 e5 = ModeleBP.Edge.SUGAR;
94                         } else {
95                                 e5 = ModeleBP.Edge.WC;
96                         }
97
98                         if (edge3.equals("W")) {
99                                 e3 = ModeleBP.Edge.WC;
100                         } else if (edge3.equals("H")) {
101                                 e3 = ModeleBP.Edge.HOOGSTEEN;
102                         } else if (edge3.equals("S")) {
103                                 e3 = ModeleBP.Edge.SUGAR;
104                         } else {
105                                 e3 = ModeleBP.Edge.WC;
106                         }
107
108                         if ((edge5.equals("+") && edge3.equals("+"))
109                                         || (edge5.equals("-") && edge3.equals("-"))) {
110                                 e3 = ModeleBP.Edge.WC;
111                                 e5 = ModeleBP.Edge.WC;
112                         }
113
114                         ModeleBP.Stericity ster;
115
116                         if (orientation.equals("c")) {
117                                 ster = ModeleBP.Stericity.CIS;
118                         } else if (orientation.equals("t")) {
119                                 ster = ModeleBP.Stericity.TRANS;
120                         } else {
121                                 ster = ModeleBP.Stericity.CIS;
122                         }
123
124                         return (new ModeleBP(mb5, mb3, e5, e3, ster));
125                 }
126
127                 public String toString() {
128                         return ("[" + pos5 + "," + pos3 + "," + edge5 + "," + edge3 + ","
129                                         + orientation + "]");
130                 }
131         }
132
133         public class RNATmp {
134                 public ArrayList<String> _sequence = new ArrayList<String>();
135                 public Vector<Integer> _sequenceIDs = new Vector<Integer>();
136                 public Vector<BPTemp> _structure = new Vector<BPTemp>();
137                 public Vector<HelixTemp> _helices = new Vector<HelixTemp>();
138
139                 public ArrayList<String> getSequence() {
140                         return _sequence;
141                 }
142
143                 public Vector<BPTemp> getStructure() {
144                         return _structure;
145                 }
146         };
147
148         private Hashtable<String, RNATmp> _molecules = new Hashtable<String, RNATmp>();
149
150         private boolean _inSequenceIDs, _inLength, _inSequence, _inHelix,
151                         _inStrAnnotation, _inBP, _inBP5, _inBP3, _inEdge5, _inEdge3,
152                         _inPosition, _inBondOrientation, _inMolecule;
153         private StringBuffer _buffer;
154         private String _currentModel = "";
155         private int _id5, _id3, _length;
156         String _edge5, _edge3, _orientation, _helixID;
157
158         public RNAMLParser() {
159                 super();
160                 _inSequenceIDs = false;
161                 _inSequence = false;
162                 _inStrAnnotation = false;
163                 _inBP = false;
164                 _inBP5 = false;
165                 _inBP3 = false;
166                 _inPosition = false;
167                 _inEdge5 = false;
168                 _inEdge3 = false;
169                 _inBondOrientation = false;
170                 _inHelix = false;
171                 _inMolecule = false;
172         }
173
174         public InputSource createSourceFromURL(String path) {
175                 URL url = null;
176                 try {
177                         url = new URL(path);
178                         URLConnection connexion = url.openConnection();
179                         connexion.setUseCaches(false);
180                         InputStream r = connexion.getInputStream();
181                         InputStreamReader inr = new InputStreamReader(r);
182                         return new InputSource(inr);
183                 } catch (Exception e) {
184                         e.printStackTrace();
185                 }
186                 return new InputSource(new StringReader(""));
187         }
188
189         public InputSource resolveEntity(String publicId, String systemId) {
190                 // System.out.println("[crade]");
191                 if (systemId.endsWith("rnaml.dtd"))
192                 {
193                         String resourceName = "/rnaml.dtd";
194                         URL url = ClassLoader.getSystemResource(resourceName);
195                         if (url!=null)
196                         {
197                                 try {
198                                         InputStream stream = url.openStream();
199                                         if (stream != null)
200                                         {
201                                                 return new InputSource(stream );
202                                         }
203                                 } catch (IOException e) {
204                                         e.printStackTrace();
205                                 }
206                         }
207                 }
208                 return new InputSource(new StringReader(""));
209         }
210
211         public void startElement(String uri, String localName, String qName,
212                         Attributes attributes) throws SAXException {
213                 if (qName.equals("numbering-table")) {
214                         _inSequenceIDs = true;
215                         _buffer = new StringBuffer();
216                 } else if (qName.equals("helix")) {
217                         _inHelix = true;
218                         _buffer = new StringBuffer();
219                         _helixID = attributes.getValue("id");
220                 } else if (qName.equals("seq-data")) {
221                         _inSequence = true;
222                         _buffer = new StringBuffer();
223                 } else if (qName.equals("length")) {
224                         _inLength = true;
225                         _buffer = new StringBuffer();
226                 } else if (qName.equals("str-annotation")) {
227                         _inStrAnnotation = true;
228                 } else if (qName.equals("base-pair")) {
229                         _inBP = true;
230                 } else if (qName.equals("base-id-5p")) {
231                         if (_inBP || _inHelix) {
232                                 _inBP5 = true;
233                         }
234                 } else if (qName.equals("base-id-3p")) {
235                         if (_inBP || _inHelix) {
236                                 _inBP3 = true;
237                         }
238                 } else if (qName.equals("edge-5p")) {
239                         _inEdge5 = true;
240                         _buffer = new StringBuffer();
241                 } else if (qName.equals("edge-3p")) {
242                         _inEdge3 = true;
243                         _buffer = new StringBuffer();
244                 } else if (qName.equals("position")) {
245                         _inPosition = true;
246                         _buffer = new StringBuffer();
247                 } else if (qName.equals("bond-orientation")) {
248                         _inBondOrientation = true;
249                         _buffer = new StringBuffer();
250                 } else if (qName.equals("molecule")) {
251                         _inMolecule = true;
252                         String id = (attributes.getValue("id"));
253                         // System.err.println("Molecule#"+id);
254                         _molecules.put(id, new RNATmp());
255                         _currentModel = id;
256                 } else {
257                         // We don't care too much about the rest ...
258                 }
259         }
260
261         public void endElement(String uri, String localName, String qName)
262                         throws SAXException {
263                 if (qName.equals("numbering-table")) {
264                         _inSequenceIDs = false;
265                         String content = _buffer.toString();
266                         content = content.trim();
267                         String[] tokens = content.split("\\s+");
268                         Vector<Integer> results = new Vector<Integer>();
269                         for (int i = 0; i < tokens.length; i++) {
270                                 try {
271                                         results.add(new Integer(Integer.parseInt(tokens[i])));
272                                 } catch (NumberFormatException e) {
273                                         e.printStackTrace();
274                                 }
275                         }
276                         _molecules.get(_currentModel)._sequenceIDs = results;
277                         _buffer = null;
278                 } else if (qName.equals("seq-data")) {
279                         _inSequence = false;
280                         String content = _buffer.toString();
281                         content = content.trim();
282                         String[] tokens = content.split("\\s+");
283                         ArrayList<String> results = new ArrayList<String>();
284                         for (int i = 0; i < tokens.length; i++) {
285                                 for (int j = 0; j < tokens[i].length(); j++)
286                                         results.add("" + tokens[i].charAt(j));
287                         }
288                         // System.err.println("  Seq: "+results);
289                         _molecules.get(_currentModel)._sequence = results;
290                         _buffer = null;
291                 } else if (qName.equals("bond-orientation")) {
292                         _inBondOrientation = false;
293                         String content = _buffer.toString();
294                         content = content.trim();
295                         _orientation = content;
296                         _buffer = null;
297                 } else if (qName.equals("str-annotation")) {
298                         _inStrAnnotation = false;
299                 } else if (qName.equals("base-pair")) {
300                         if (_inMolecule) {
301                                 _inBP = false;
302                                 BPTemp bp = new BPTemp(_id5, _id3, _edge5, _edge3, _orientation);
303                                 _molecules.get(_currentModel)._structure.add(bp);
304                                 // System.err.println("  "+bp);
305                         }
306                 } else if (qName.equals("helix")) {
307                         _inHelix = false;
308                         if (_inMolecule) {
309                                 HelixTemp h = new HelixTemp(_id5, _id3, _length, _helixID);
310                                 _molecules.get(_currentModel)._helices.add(h);
311                         }
312                 } else if (qName.equals("base-id-5p")) {
313                         _inBP5 = false;
314                 } else if (qName.equals("base-id-3p")) {
315                         _inBP3 = false;
316                 } else if (qName.equals("length")) {
317                         _inLength = false;
318                         String content = _buffer.toString();
319                         content = content.trim();
320                         _length = Integer.parseInt(content);
321                         _buffer = null;
322                 } else if (qName.equals("position")) {
323                         String content = _buffer.toString();
324                         content = content.trim();
325                         int pos = Integer.parseInt(content);
326                         if (_inBP5) {
327                                 _id5 = pos;
328                         }
329                         if (_inBP3) {
330                                 _id3 = pos;
331                         }
332                         _buffer = null;
333                 } else if (qName.equals("edge-5p")) {
334                         _inEdge5 = false;
335                         String content = _buffer.toString();
336                         content = content.trim();
337                         _edge5 = content;
338                         _buffer = null;
339                 } else if (qName.equals("edge-3p")) {
340                         _inEdge3 = false;
341                         String content = _buffer.toString();
342                         content = content.trim();
343                         _edge3 = content;
344                         _buffer = null;
345                 } else if (qName.equals("molecule")) {
346                         _inMolecule = false;
347                 } else {
348                         // We don't care too much about the rest ...
349                 }
350         }
351
352         public void characters(char[] ch, int start, int length)
353                         throws SAXException {
354                 String lecture = new String(ch, start, length);
355                 if (_buffer != null)
356                         _buffer.append(lecture);
357         }
358
359         public void startDocument() throws SAXException {
360         }
361
362         public void endDocument() throws SAXException {
363                 postProcess();
364         }
365
366         // Discarding stacking interactions...
367         private void discardStacking() {
368                 Vector<BPTemp> result = new Vector<BPTemp>();
369                 for (int i = 0; i < _molecules.get(_currentModel)._structure.size(); i++) {
370                         BPTemp bp = _molecules.get(_currentModel)._structure.get(i);
371                         if (bp.orientation.equals("c") || bp.orientation.equals("t")) {
372                                 result.add(bp);
373                         }
374                 }
375                 _molecules.get(_currentModel)._structure = result;
376         }
377
378         public static boolean isSelfCrossing(int[] str) {
379                 Stack<Point> intervals = new Stack<Point>();
380                 intervals.add(new Point(0, str.length - 1));
381                 while (!intervals.empty()) {
382                         Point p = intervals.pop();
383                         if (p.x <= p.y) {
384                                 if (str[p.x] == -1) {
385                                         intervals.push(new Point(p.x + 1, p.y));
386                                 } else {
387                                         int i = p.x;
388                                         int j = p.y;
389                                         int k = str[i];
390                                         if ((k <= i) || (k > j)) {
391                                                 return true;
392                                         } else {
393                                                 intervals.push(new Point(i + 1, k - 1));
394                                                 intervals.push(new Point(k + 1, j));
395                                         }
396                                 }
397                         }
398                 }
399                 return false;
400         }
401
402         @SuppressWarnings("unused")
403         private void debugPrintArray(Object[] str) {
404                 StringBuffer s = new StringBuffer("[");
405                 for (int i = 0; i < str.length; i++) {
406                         if (i != 0) {
407                                 s.append(",");
408                         }
409                         s.append(str[i]);
410
411                 }
412                 s.append("]");
413                 System.out.println(s.toString());
414         }
415
416         /**
417          * Computes and returns a maximal planar subset of the current structure.
418          * 
419          * @param str
420          *            A sequence of base-pairing positions
421          * @return A sequence of non-crossing base-pairing positions
422          */
423
424         public static int[] planarize(int[] str) {
425                 if (!isSelfCrossing(str)) {
426                         return str;
427                 }
428
429                 int length = str.length;
430
431                 int[] result = new int[length];
432                 for (int i = 0; i < result.length; i++) {
433                         result[i] = -1;
434                 }
435
436                 short[][] tab = new short[length][length];
437                 short[][] backtrack = new short[length][length];
438                 int theta = 3;
439
440                 for (int i = 0; i < result.length; i++) {
441                         for (int j = i; j < Math.min(i + theta, result.length); j++) {
442                                 tab[i][j] = 0;
443                                 backtrack[i][j] = -1;
444                         }
445                 }
446                 for (int n = theta; n < length; n++) {
447                         for (int i = 0; i < length - n; i++) {
448                                 int j = i + n;
449                                 tab[i][j] = tab[i + 1][j];
450                                 backtrack[i][j] = -1;
451                                 int k = str[i];
452                                 if ((k != -1) && (k <= j) && (i < k)) {
453                                         int tmp = 1;
454                                         if (i + 1 <= k - 1) {
455                                                 tmp += tab[i + 1][k - 1];
456                                         }
457                                         if (k + 1 <= j) {
458                                                 tmp += tab[k + 1][j];
459                                         }
460                                         if (tmp > tab[i][j]) {
461                                                 tab[i][j] = (short) tmp;
462                                                 backtrack[i][j] = (short) k;
463                                         }
464                                 }
465                         }
466                 }
467                 Stack<Point> intervals = new Stack<Point>();
468                 intervals.add(new Point(0, length - 1));
469                 while (!intervals.empty()) {
470                         Point p = intervals.pop();
471                         if (p.x <= p.y) {
472                                 if (backtrack[p.x][p.y] == -1) {
473                                         result[p.x] = -1;
474                                         intervals.push(new Point(p.x + 1, p.y));
475                                 } else {
476                                         int i = p.x;
477                                         int j = p.y;
478                                         int k = backtrack[p.x][p.y];
479                                         result[i] = k;
480                                         result[k] = i;
481                                         intervals.push(new Point(i + 1, k - 1));
482                                         intervals.push(new Point(k + 1, j));
483                                 }
484                         }
485                 }
486                 return result;
487         }
488
489         public static void planarize(ArrayList<ModeleBP> input,
490                         ArrayList<ModeleBP> planar, ArrayList<ModeleBP> others, int length) {
491                 // System.err.println("Planarize: Length:"+length);
492                 Hashtable<Integer, ArrayList<ModeleBP>> index2BPs = new Hashtable<Integer, ArrayList<ModeleBP>>();
493                 for (ModeleBP msbp : input) {
494                         int i = msbp.getPartner5().getIndex();
495                         if (!index2BPs.containsKey(i)) {
496                                 index2BPs.put(i, new ArrayList<ModeleBP>());
497                         }
498                         index2BPs.get(i).add(msbp);
499                 }
500                 // System.err.println(index2BPs);
501
502                 short[][] tab = new short[length][length];
503                 short[][] backtrack = new short[length][length];
504                 int theta = 3;
505
506                 for (int i = 0; i < length; i++) {
507                         for (int j = i; j < Math.min(i + theta, length); j++) {
508                                 tab[i][j] = 0;
509                                 backtrack[i][j] = -1;
510                         }
511                 }
512                 for (int n = theta; n < length; n++) {
513                         for (int i = 0; i < length - n; i++) {
514                                 int j = i + n;
515                                 tab[i][j] = tab[i + 1][j];
516                                 backtrack[i][j] = -1;
517                                 if (index2BPs.containsKey(i)) {
518                                         ArrayList<ModeleBP> vi = index2BPs.get(i);
519                                         // System.err.print(".");
520                                         for (int numBP = 0; numBP < vi.size(); numBP++) {
521                                                 ModeleBP mb = vi.get(numBP);
522                                                 int k = mb.getPartner3().getIndex();
523                                                 if ((k != -1) && (k <= j) && (i < k)) {
524                                                         int tmp = 1;
525                                                         if (i + 1 <= k - 1) {
526                                                                 tmp += tab[i + 1][k - 1];
527                                                         }
528                                                         if (k + 1 <= j) {
529                                                                 tmp += tab[k + 1][j];
530                                                         }
531                                                         if (tmp > tab[i][j]) {
532                                                                 tab[i][j] = (short) tmp;
533                                                                 backtrack[i][j] = (short) numBP;
534                                                         }
535                                                 }
536                                         }
537                                 }
538                         }
539                 }
540                 // System.err.println("DP table: "+tab[0][length-1]);
541
542                 // Backtracking
543                 Stack<Point> intervals = new Stack<Point>();
544                 intervals.add(new Point(0, length - 1));
545                 while (!intervals.empty()) {
546                         Point p = intervals.pop();
547                         if (p.x <= p.y) {
548                                 if (backtrack[p.x][p.y] == -1) {
549                                         intervals.push(new Point(p.x + 1, p.y));
550                                 } else {
551                                         int i = p.x;
552                                         int j = p.y;
553                                         int nb = backtrack[p.x][p.y];
554                                         ModeleBP mb = index2BPs.get(i).get(nb);
555                                         int k = mb.getPartner3().getIndex();
556                                         planar.add(mb);
557                                         intervals.push(new Point(i + 1, k - 1));
558                                         intervals.push(new Point(k + 1, j));
559                                 }
560                         }
561                 }
562
563                 // Remaining base pairs
564                 for (int i : index2BPs.keySet()) {
565                         ArrayList<ModeleBP> vi = index2BPs.get(i);
566                         for (ModeleBP mb : vi) {
567                                 if (!planar.contains(mb)) {
568                                         others.add(mb);
569                                 }
570                         }
571                 }
572         }
573
574         private void postProcess() {
575                 for (RNATmp r : _molecules.values()) {
576                         // First, check if base numbers were specified
577                         if (r._sequenceIDs.size() == 0) {
578                                 Vector<Integer> results = new Vector<Integer>();
579                                 for (int i = 0; i < r._sequence.size(); i++) {
580                                         results.add(new Integer(i + 1));
581                                 }
582                                 r._sequenceIDs = results;
583                         }
584                         // System.err.println("IDs: "+_sequenceIDs);
585                         // System.err.println("Before remapping: "+_structure);
586
587                         // Then, build inverse mapping ID => index
588                         Hashtable<Integer, Integer> ID2Index = new Hashtable<Integer, Integer>();
589                         for (int i = 0; i < r._sequenceIDs.size(); i++) {
590                                 ID2Index.put(r._sequenceIDs.get(i), i);
591                         }
592
593                         // Translate BP coordinates into indices
594                         for (BPTemp bp : r._structure) {
595                                 bp.pos3 = bp.pos3 - 1;
596                                 bp.pos5 = bp.pos5 - 1;
597                         }
598                         // System.err.println("After remapping: "+_structure);
599
600                         discardStacking();
601                         // System.err.println("  Discard stacking (length="+r._sequence.size()+") => "+r._structure);
602
603                         // Eliminate redundancy
604                         Hashtable<Integer, Hashtable<Integer,BPTemp>> index2BPs = new Hashtable<Integer, Hashtable<Integer,BPTemp>>();
605                         for (BPTemp msbp : r._structure) {
606                                 int i = msbp.pos5;
607                                 if (!index2BPs.containsKey(i)) {
608                                         index2BPs.put(i, new Hashtable<Integer,BPTemp>());
609                                 }
610                                 if (!index2BPs.get(i).contains(msbp.pos3)) {
611                                         index2BPs.get(i).put(msbp.pos3,msbp);
612                                 }
613                         }
614                         
615                         // Adding helices...
616                         for (int i = 0; i < r._helices.size(); i++) {
617                                 HelixTemp h = r._helices.get(i);
618                                 for (int j = 0; j < h.length; j++) {
619                                         // System.err.println("Looking for residues: "+(h.pos5+j-1)+" and "+(h.pos3-j-1));
620                                         int a = (h.pos5 + j - 1);
621                                         int b = (h.pos3 - j - 1);
622                                         BPTemp bp = new BPTemp(a, (b), "+", "+", "c");
623                                         if (!index2BPs.containsKey(a)) {
624                                                 index2BPs.put(a, new Hashtable<Integer,BPTemp>());
625                                         }
626                                         if (!index2BPs.get(a).contains(b)) {
627                                                 index2BPs.get(a).put(b,bp);
628                                         }
629                                 }
630                         }
631
632                         Vector<BPTemp> newStructure = new Vector<BPTemp>();
633                         for (int i : index2BPs.keySet()) {
634                                 for (int j : index2BPs.get(i).keySet()) {
635                                         BPTemp bp = index2BPs.get(i).get(j);
636                                         newStructure.add(bp);
637                                 }
638                         }
639                         r._structure = newStructure;
640
641                         // System.err.println("After Helices => "+_structure);
642
643
644                         // System.err.println("After Postprocess => "+_structure);
645                 }
646         }
647
648         public ArrayList<RNATmp> getMolecules() {
649                 return new ArrayList<RNATmp>(_molecules.values());
650         }
651 }