Merge branch 'develop' into feature/r2_11_2/JAL-3808_gff2_exonerate feature/r2_11_2/JAL-3808_gff2_exonerate
author Jim Procter <j.procter@dundee.ac.uk>
Wed, 16 Feb 2022 12:17:20 +0000 (12:17 +0000)
committer Jim Procter <j.procter@dundee.ac.uk>
Wed, 16 Feb 2022 12:17:20 +0000 (12:17 +0000)
examples/testdata/test_cdna2genome_showquerygff.gff2 [new file with mode: 0644]
examples/testdata/test_cdna2genome_showtargetgff.gff2 [new file with mode: 0644]
examples/testdata/test_coding2genome_showquerygff.gff2 [new file with mode: 0644]
examples/testdata/test_coding2genome_showtargetgff.gff2 [new file with mode: 0644]
src/jalview/io/gff/ExonerateHelper.java
test/jalview/io/FeaturesFileTest.java

diff --git a/examples/testdata/test_cdna2genome_showquerygff.gff2 b/examples/testdata/test_cdna2genome_showquerygff.gff2
new file mode 100644 (file)
index 0000000..83983a7
--- /dev/null
@@ -0,0 +1,74 @@
+##gff-version 2
+##source-version exonerate:cdna2genome 2.4.0
+##date 2021-02-03
+##type DNA
+#
+#
+# seqname source feature start end score strand frame attributes
+#
+CDS|ENST00000398721/1-183      exonerate:cdna2genome   similarity      1       183     885     +       .       alignment_id 0 ; Target ENSG00000214643/1-4245 ; Align 1 822 59 ; Align 60 4039 124
+##FASTA
+>ENSG00000214643/1-4245 chromosome:GRCh38:6:49946021:49950265:-1
+CTCCTGCCTCAGCCTCTGGGAGTAGCTGGGACTACAGGGGCCCGCCACCACGCCCGGATAATTTTTTTCTAT
+TTTTTAGTAGAGACGGGGTTTCACCGTGTTAGCCGGGACGGTCTTAATCTCCTGACCTAGTTATCCGCCAGC
+CTCGGCCTCCCAAAGTGCTGGAATTACGGGTGTGAGTATATTGCTTTTTAAATTCACTAGTTTATTCATTAT
+GTATAGCTATTTAAAAAAGAGAAAAACTGTCCTGGCTAACACGGTGAAACCCCGTTTCTACTAAAAATACAA
+AAAATTAGCCGGCCTGGTGGCGGGTGCCTGTAGTACCACCTACTCTGGAGGCTGAGGCAGGAGAATGGCGTG
+AACCCGGGAGGCAGAGCTTGCAGTGAGTCGCGATCGCGCCACTGAACTCCAGCCTGGGTGACAGAACGAGAC
+TCCGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAGAGAGAGAGAGAGAGAGAGAGAGAAACTAAATCAAAAG
+CAAAGATTAAATTCTCATGCTTCTACTTTTCAGACCTTCAATAGGAGATTCTCAGTGGCATGCACTTCCTAG
+ACAACCAGTTATTAAAAAGATAATTATATACTATGGATATCATAACCCTTTATATCCCAAGATAAAAGAAAA
+GTAATGTCCTGCATTTTTGCCCCAGGCACTGCTAAACAGGAGTGTTAACTTTTGATAAACCCCAGTTTCTAA
+ATTCGTACAACTGCATGGAGAGAAGTATAAAAAGATGTGCTCTCCACCTTTCTCTTATTCAAACACTGCAGA
+GCTGCTAATTCGGCTGACTGTCTTCAATCATGAAGATTCACGTCTTTCTCTTTGTTTTATTTTTCTTTCTGG
+TCCCCATTGCCACCAGGTAAAATGTAAATATTGAGAGGGTAAAATATCTTTCTGGAGAGTTGCATTTTGGTC
+GATGTCAACTGGAGACACAAGAGTGACCAGATTGTTAAGAGAGTGGTTGGTTACGGTCCCAAGAGTAAATCA
+CAAATAAAGGACTGTGGTCTTACTAAACAGTAAGGCTCCGGGTGAGTCATATTATTGTATGAGACAATTTAA
+AGGCCTTGATTAAATGACTACAGGCTGGTTGTCCAGACTGACTACTTGGATAGAATATTAAAGTATAACCAT
+CTGGATCCTTGAGACATTGATGGGCACTGTGTAGTAGAGTAACAGTTGGTTTTTCAGTAACCTCATGATAGG
+TTTGGTTATATTATGTCACAAGGTTTACTTTGAGAAAAACACAAAATGGTGTTTTCAACTGATTAAGAATTG
+AGAACTTCAGTGGCTGTATTACCTCAGGTCCAATTACATAGCAATTAATACTAACCAGATAATTAAATGAGT
+ATAAACTTTGGAATAATTGATACCTGGGTTCAGATTTGGCTTTTGCCATGTATTGGCTATGCAAATCTATCA
+GGTTATTTACACTCATTCAGCATCAATTTCCTCATTTGCGAATTAAATACAGTAATTAAGTCTATCTGTAGT
+ATTATGACGATTAATGATATTTAAATATACAAAATGCCCAACATAAAGTATAATACACATCAGATTCATCTT
+AAATAGTATTATAATTACTTTTACATTGTATTAAAACACCTTAAAAAGCAAGTAACATCTAGAAATCTCATC
+GAGAAACTCTTTCTTGACAACTATTGCAGGAAACAGGGGAGGTAATATTTACCATGTTGGATTTCACATACT
+TAAACTGTGTACCCTCATTTGTAAACTTTCATTGTACACACTACTATTAGATTAGATCAATAGCCAAAGGAA
+GTATCTTGATGGAGATCATCGAAAGAGCTGTTTTAACACATTTCAGGGTACTTATGATAGGAAAAGTATTTT
+TCTTATTTATGTAAAGACCCTGTAAACAATCATTTTACACACACCCAAGAAAATAATTCCACTCACCTTAAG
+AAAATAAAACAAACAAAAAAATAATAACTATAGGGCATTCTGATTTAGGAGTGGTCTAGAACTGCAAGAGTT
+TCCTCTTTGAAGTCCAACTTCATTCCTTAGCAGCACAGATGAGACCAAGGAAGTCTAGCTCTAACAATAAGT
+CTGGCTGTGAGTAAACTCAAACATCTAAGTATTTTGGATAAAAGACTCAAATCATTTTAATTGTTTTATAAT
+TCAAGTAAAATATGTCAGATGATAATTTTGAGGATTGTTATTTTATTACATCATTCTAAACTCAAGACTAAT
+GGAATTACAATCAGGGTTTGTTTTTACTAAAACCATATGGGGGTGATCTTAAAATAAAGAAGACTTATTTAG
+TGCAACCACATTTGGATAATAAAATTAGCTCCAGTTATGTTATCATGATTTACATGTGTGCCTCATGTCACT
+TGCTTTGGGAAAAAAGGTGAAAATTTTCATGACATTTTCAGTTTCATTATAAACCCAAGAGACATTATGAAT
+GTATGTGCATAGATTCATCTAAAAATTGAGAAATTTGTGATTCCTTCTTGTACATAAATAAATTTTTATGTT
+ATACATAGCTGTTATCTAGAGAGAAGCTGTTTGTTGTTTTTGTTACTAGTGGAAAAATGCTAATTTACAATT
+CCCACTTAAATAATGCTATTCTTTTTTTAATACTAGTGTTTGACAGCTGTCATCTTCTATTATTTGTAAATT
+CCAATTAAGTCAACATACATGCTTCACACCCATCATGTAGGACAGTGTAATCTGAGGATGCATTTTTAAGCT
+GGCAAGATAATGCTTTTTCCCTCTGGCATCAAAGGCAATAAATAGCATCTCTCTAGATAATACAAATAAAGA
+GATATCTATCATTAATTTATACAGTGAAAAAGCATATCTCGTTGCCTAAAATATTTGGCAAAAAATTCAAAA
+TAATATGTTCAATTTTTTCCTAAAATGTCTAGTTACTCCAAGTAACAGTAAGACTGTATGCTAACATAACTT
+CACTGAAAGCAGATGACATGAATTGGGTTATCAGTACCTTTGACTCCATCTCAGTTGGATCCTATATGTTGA
+GCACCTAGAGAAAGAGCACCTTTCACATACCTTTGTCACTGATATTTGATAATTTACCATTACATTTAATGC
+TTTAACTTTGGGCGTCTTCCTCCTTATGTGACCTTTCTAAAAATTTAAACATTTTGTGAAGAGATGAAACAT
+TAAGTCTTATGGCCAGATGTAGATGAGTTCATCAGCTGGTATATGCTATGATGATGCAATCCTGAGAACTTT
+ATAAAGGAGAAGACTCTGAAACAAGGAAACATACGGAGATGCTGAGAGATACTACCTCTTTCTCTTTGAACT
+TAAGGCTTGTCATTCATGACAACACTCATCCCCCCAGCAAACCAGGCTCCTGCCTAGGCTCAAATCTTTGTG
+AATATTCAAAGAACCTCTCTTATCCCAGTATCTGCTAGGGTTTTTTTCACTCTTCTAATAAAAGAGATTTAT
+AAAAAATATTCGAGCAAGATAGCAATTGAAGCAACATTAATGAATCTTCCCATTTTAAATCCTGAAACCAAA
+AATGTTTAAATGTTTAATTTTTTTTTTGAAATTGGAGAAAAAAGCCTTTGGGAGGTGATGGGATCATAGATA
+TTTGTATATGCCCAAATTCATCATTTGTACACATTAAATATGTGCAGTTCTTTTTATAGCAACTATACCTCA
+ATAAAGTTGTTTTTTTAAAAAAATCATGTTAGGTGGGTAGTAGGTAAGTAGCCATAGAAATATGTGGTGTTC
+ATTCCAGAACTACTTGAAATCTAACAATTTTTATTTAGAAATTCTCAGGGAAAACTAAGCTTTAACAACTAG
+TGGATCAAGCCAGAAGTTGGAGAGGAAACTGAGAATGAAACAGAAAGTATCACAGAACAATCATCACCAGAA
+AAATACAAAACAAATGACTTCCCAAATCCCTACCCCAGAATGCAAGGGCTATACTTATTTTCACATTGACAA
+CTGATTTATCCCAAGGTGGGACCTTAAGACACTTATAGTGACTCAATGGTTTATGTGCTTTCTTTTTCCCCT
+TTTAACAGTGAAATGTGCCGTGAAAGACACCTATAGTTGCTTTATCATGAGAGGCAAATGTCGACATGAGTG
+TCATGATTTTGAAAAACCAATTGGTTTCTGCACAAAACTAAATGCCAACTGTTATATGTAGAAGTTTGAAAT
+AAAAATCAGCATCCCTCAAAAGTAAAGCACAGTGAGTACAAGGATTAGTAATAAAAGAAACTGAATTAC
+>CDS|ENST00000398721/1-183 DEFB133-201
+ATGAAGATTCACGTCTTTCTCTTTGTTTTATTTTTCTTTCTGGTCCCCATTGCCACCAGGGTGAAATGTGCC
+GTGAAAGACACCTATAGTTGCTTTATCATGAGAGGCAAATGTCGACATGAGTGTCATGATTTTGAAAAACCA
+ATTGGTTTCTGCACAAAACTAAATGCCAACTGTTATATG
diff --git a/examples/testdata/test_cdna2genome_showtargetgff.gff2 b/examples/testdata/test_cdna2genome_showtargetgff.gff2
new file mode 100644 (file)
index 0000000..0a5cb2d
--- /dev/null
@@ -0,0 +1,81 @@
+##gff-version 2
+##source-version exonerate:cdna2genome 2.4.0
+##date 2021-02-03
+##type DNA
+#
+#
+# seqname source feature start end score strand frame attributes
+#
+ENSG00000214643/1-4245 exonerate:cdna2genome   gene    822     4162    885     +       .       gene_id 0 ; sequence CDS|ENST00000398721/1-183 ; gene_orientation + ; identity 99.45 ; similarity 99.45
+ENSG00000214643/1-4245 exonerate:cdna2genome   utr5    822     880     .       +       .       
+ENSG00000214643/1-4245 exonerate:cdna2genome   exon    822     880     .       +       .       insertions 0 ; deletions 0 ; identity 98.33 ; similarity 98.33
+ENSG00000214643/1-4245 exonerate:cdna2genome   splice5 881     882     .       +       .       intron_id 1 ; splice_site "GT"
+ENSG00000214643/1-4245 exonerate:cdna2genome   intron  881     4038    .       +       .       intron_id 1
+ENSG00000214643/1-4245 exonerate:cdna2genome   splice3 4037    4038    .       +       .       intron_id 0 ; splice_site "AC"
+ENSG00000214643/1-4245 exonerate:cdna2genome   exon    4039    4162    .       +       .       insertions 0 ; deletions 0 ; identity 99.19 ; similarity 99.19
+ENSG00000214643/1-4245 exonerate:cdna2genome   similarity      822     4162    885     +       .       alignment_id 0 ; Query CDS|ENST00000398721/1-183 ; Align 822 1 59 ; Align 4039 60 124
+##FASTA
+>ENSG00000214643/1-4245 chromosome:GRCh38:6:49946021:49950265:-1
+CTCCTGCCTCAGCCTCTGGGAGTAGCTGGGACTACAGGGGCCCGCCACCACGCCCGGATAATTTTTTTCTAT
+TTTTTAGTAGAGACGGGGTTTCACCGTGTTAGCCGGGACGGTCTTAATCTCCTGACCTAGTTATCCGCCAGC
+CTCGGCCTCCCAAAGTGCTGGAATTACGGGTGTGAGTATATTGCTTTTTAAATTCACTAGTTTATTCATTAT
+GTATAGCTATTTAAAAAAGAGAAAAACTGTCCTGGCTAACACGGTGAAACCCCGTTTCTACTAAAAATACAA
+AAAATTAGCCGGCCTGGTGGCGGGTGCCTGTAGTACCACCTACTCTGGAGGCTGAGGCAGGAGAATGGCGTG
+AACCCGGGAGGCAGAGCTTGCAGTGAGTCGCGATCGCGCCACTGAACTCCAGCCTGGGTGACAGAACGAGAC
+TCCGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAGAGAGAGAGAGAGAGAGAGAGAGAAACTAAATCAAAAG
+CAAAGATTAAATTCTCATGCTTCTACTTTTCAGACCTTCAATAGGAGATTCTCAGTGGCATGCACTTCCTAG
+ACAACCAGTTATTAAAAAGATAATTATATACTATGGATATCATAACCCTTTATATCCCAAGATAAAAGAAAA
+GTAATGTCCTGCATTTTTGCCCCAGGCACTGCTAAACAGGAGTGTTAACTTTTGATAAACCCCAGTTTCTAA
+ATTCGTACAACTGCATGGAGAGAAGTATAAAAAGATGTGCTCTCCACCTTTCTCTTATTCAAACACTGCAGA
+GCTGCTAATTCGGCTGACTGTCTTCAATCATGAAGATTCACGTCTTTCTCTTTGTTTTATTTTTCTTTCTGG
+TCCCCATTGCCACCAGGTAAAATGTAAATATTGAGAGGGTAAAATATCTTTCTGGAGAGTTGCATTTTGGTC
+GATGTCAACTGGAGACACAAGAGTGACCAGATTGTTAAGAGAGTGGTTGGTTACGGTCCCAAGAGTAAATCA
+CAAATAAAGGACTGTGGTCTTACTAAACAGTAAGGCTCCGGGTGAGTCATATTATTGTATGAGACAATTTAA
+AGGCCTTGATTAAATGACTACAGGCTGGTTGTCCAGACTGACTACTTGGATAGAATATTAAAGTATAACCAT
+CTGGATCCTTGAGACATTGATGGGCACTGTGTAGTAGAGTAACAGTTGGTTTTTCAGTAACCTCATGATAGG
+TTTGGTTATATTATGTCACAAGGTTTACTTTGAGAAAAACACAAAATGGTGTTTTCAACTGATTAAGAATTG
+AGAACTTCAGTGGCTGTATTACCTCAGGTCCAATTACATAGCAATTAATACTAACCAGATAATTAAATGAGT
+ATAAACTTTGGAATAATTGATACCTGGGTTCAGATTTGGCTTTTGCCATGTATTGGCTATGCAAATCTATCA
+GGTTATTTACACTCATTCAGCATCAATTTCCTCATTTGCGAATTAAATACAGTAATTAAGTCTATCTGTAGT
+ATTATGACGATTAATGATATTTAAATATACAAAATGCCCAACATAAAGTATAATACACATCAGATTCATCTT
+AAATAGTATTATAATTACTTTTACATTGTATTAAAACACCTTAAAAAGCAAGTAACATCTAGAAATCTCATC
+GAGAAACTCTTTCTTGACAACTATTGCAGGAAACAGGGGAGGTAATATTTACCATGTTGGATTTCACATACT
+TAAACTGTGTACCCTCATTTGTAAACTTTCATTGTACACACTACTATTAGATTAGATCAATAGCCAAAGGAA
+GTATCTTGATGGAGATCATCGAAAGAGCTGTTTTAACACATTTCAGGGTACTTATGATAGGAAAAGTATTTT
+TCTTATTTATGTAAAGACCCTGTAAACAATCATTTTACACACACCCAAGAAAATAATTCCACTCACCTTAAG
+AAAATAAAACAAACAAAAAAATAATAACTATAGGGCATTCTGATTTAGGAGTGGTCTAGAACTGCAAGAGTT
+TCCTCTTTGAAGTCCAACTTCATTCCTTAGCAGCACAGATGAGACCAAGGAAGTCTAGCTCTAACAATAAGT
+CTGGCTGTGAGTAAACTCAAACATCTAAGTATTTTGGATAAAAGACTCAAATCATTTTAATTGTTTTATAAT
+TCAAGTAAAATATGTCAGATGATAATTTTGAGGATTGTTATTTTATTACATCATTCTAAACTCAAGACTAAT
+GGAATTACAATCAGGGTTTGTTTTTACTAAAACCATATGGGGGTGATCTTAAAATAAAGAAGACTTATTTAG
+TGCAACCACATTTGGATAATAAAATTAGCTCCAGTTATGTTATCATGATTTACATGTGTGCCTCATGTCACT
+TGCTTTGGGAAAAAAGGTGAAAATTTTCATGACATTTTCAGTTTCATTATAAACCCAAGAGACATTATGAAT
+GTATGTGCATAGATTCATCTAAAAATTGAGAAATTTGTGATTCCTTCTTGTACATAAATAAATTTTTATGTT
+ATACATAGCTGTTATCTAGAGAGAAGCTGTTTGTTGTTTTTGTTACTAGTGGAAAAATGCTAATTTACAATT
+CCCACTTAAATAATGCTATTCTTTTTTTAATACTAGTGTTTGACAGCTGTCATCTTCTATTATTTGTAAATT
+CCAATTAAGTCAACATACATGCTTCACACCCATCATGTAGGACAGTGTAATCTGAGGATGCATTTTTAAGCT
+GGCAAGATAATGCTTTTTCCCTCTGGCATCAAAGGCAATAAATAGCATCTCTCTAGATAATACAAATAAAGA
+GATATCTATCATTAATTTATACAGTGAAAAAGCATATCTCGTTGCCTAAAATATTTGGCAAAAAATTCAAAA
+TAATATGTTCAATTTTTTCCTAAAATGTCTAGTTACTCCAAGTAACAGTAAGACTGTATGCTAACATAACTT
+CACTGAAAGCAGATGACATGAATTGGGTTATCAGTACCTTTGACTCCATCTCAGTTGGATCCTATATGTTGA
+GCACCTAGAGAAAGAGCACCTTTCACATACCTTTGTCACTGATATTTGATAATTTACCATTACATTTAATGC
+TTTAACTTTGGGCGTCTTCCTCCTTATGTGACCTTTCTAAAAATTTAAACATTTTGTGAAGAGATGAAACAT
+TAAGTCTTATGGCCAGATGTAGATGAGTTCATCAGCTGGTATATGCTATGATGATGCAATCCTGAGAACTTT
+ATAAAGGAGAAGACTCTGAAACAAGGAAACATACGGAGATGCTGAGAGATACTACCTCTTTCTCTTTGAACT
+TAAGGCTTGTCATTCATGACAACACTCATCCCCCCAGCAAACCAGGCTCCTGCCTAGGCTCAAATCTTTGTG
+AATATTCAAAGAACCTCTCTTATCCCAGTATCTGCTAGGGTTTTTTTCACTCTTCTAATAAAAGAGATTTAT
+AAAAAATATTCGAGCAAGATAGCAATTGAAGCAACATTAATGAATCTTCCCATTTTAAATCCTGAAACCAAA
+AATGTTTAAATGTTTAATTTTTTTTTTGAAATTGGAGAAAAAAGCCTTTGGGAGGTGATGGGATCATAGATA
+TTTGTATATGCCCAAATTCATCATTTGTACACATTAAATATGTGCAGTTCTTTTTATAGCAACTATACCTCA
+ATAAAGTTGTTTTTTTAAAAAAATCATGTTAGGTGGGTAGTAGGTAAGTAGCCATAGAAATATGTGGTGTTC
+ATTCCAGAACTACTTGAAATCTAACAATTTTTATTTAGAAATTCTCAGGGAAAACTAAGCTTTAACAACTAG
+TGGATCAAGCCAGAAGTTGGAGAGGAAACTGAGAATGAAACAGAAAGTATCACAGAACAATCATCACCAGAA
+AAATACAAAACAAATGACTTCCCAAATCCCTACCCCAGAATGCAAGGGCTATACTTATTTTCACATTGACAA
+CTGATTTATCCCAAGGTGGGACCTTAAGACACTTATAGTGACTCAATGGTTTATGTGCTTTCTTTTTCCCCT
+TTTAACAGTGAAATGTGCCGTGAAAGACACCTATAGTTGCTTTATCATGAGAGGCAAATGTCGACATGAGTG
+TCATGATTTTGAAAAACCAATTGGTTTCTGCACAAAACTAAATGCCAACTGTTATATGTAGAAGTTTGAAAT
+AAAAATCAGCATCCCTCAAAAGTAAAGCACAGTGAGTACAAGGATTAGTAATAAAAGAAACTGAATTAC
+>CDS|ENST00000398721/1-183 DEFB133-201
+ATGAAGATTCACGTCTTTCTCTTTGTTTTATTTTTCTTTCTGGTCCCCATTGCCACCAGGGTGAAATGTGCC
+GTGAAAGACACCTATAGTTGCTTTATCATGAGAGGCAAATGTCGACATGAGTGTCATGATTTTGAAAAACCA
+ATTGGTTTCTGCACAAAACTAAATGCCAACTGTTATATG
diff --git a/examples/testdata/test_coding2genome_showquerygff.gff2 b/examples/testdata/test_coding2genome_showquerygff.gff2
new file mode 100644 (file)
index 0000000..0466da8
--- /dev/null
@@ -0,0 +1,76 @@
+##gff-version 2
+##source-version exonerate:coding2genome 2.4.0
+##date 2021-02-03
+##type DNA
+#
+#
+# seqname source feature start end score strand frame attributes
+#
+CDS|ENST00000398721/1-183      exonerate:coding2genome similarity      1       183     322     +       .       alignment_id 0 ; Target ENSG00000214643/1-4245 ; Align 1 822 57 ; Align 61 4040 123
+# --- END OF GFF DUMP ---
+#
+##FASTA
+>ENSG00000214643/1-4245 chromosome:GRCh38:6:49946021:49950265:-1
+CTCCTGCCTCAGCCTCTGGGAGTAGCTGGGACTACAGGGGCCCGCCACCACGCCCGGATAATTTTTTTCTAT
+TTTTTAGTAGAGACGGGGTTTCACCGTGTTAGCCGGGACGGTCTTAATCTCCTGACCTAGTTATCCGCCAGC
+CTCGGCCTCCCAAAGTGCTGGAATTACGGGTGTGAGTATATTGCTTTTTAAATTCACTAGTTTATTCATTAT
+GTATAGCTATTTAAAAAAGAGAAAAACTGTCCTGGCTAACACGGTGAAACCCCGTTTCTACTAAAAATACAA
+AAAATTAGCCGGCCTGGTGGCGGGTGCCTGTAGTACCACCTACTCTGGAGGCTGAGGCAGGAGAATGGCGTG
+AACCCGGGAGGCAGAGCTTGCAGTGAGTCGCGATCGCGCCACTGAACTCCAGCCTGGGTGACAGAACGAGAC
+TCCGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAGAGAGAGAGAGAGAGAGAGAGAGAAACTAAATCAAAAG
+CAAAGATTAAATTCTCATGCTTCTACTTTTCAGACCTTCAATAGGAGATTCTCAGTGGCATGCACTTCCTAG
+ACAACCAGTTATTAAAAAGATAATTATATACTATGGATATCATAACCCTTTATATCCCAAGATAAAAGAAAA
+GTAATGTCCTGCATTTTTGCCCCAGGCACTGCTAAACAGGAGTGTTAACTTTTGATAAACCCCAGTTTCTAA
+ATTCGTACAACTGCATGGAGAGAAGTATAAAAAGATGTGCTCTCCACCTTTCTCTTATTCAAACACTGCAGA
+GCTGCTAATTCGGCTGACTGTCTTCAATCATGAAGATTCACGTCTTTCTCTTTGTTTTATTTTTCTTTCTGG
+TCCCCATTGCCACCAGGTAAAATGTAAATATTGAGAGGGTAAAATATCTTTCTGGAGAGTTGCATTTTGGTC
+GATGTCAACTGGAGACACAAGAGTGACCAGATTGTTAAGAGAGTGGTTGGTTACGGTCCCAAGAGTAAATCA
+CAAATAAAGGACTGTGGTCTTACTAAACAGTAAGGCTCCGGGTGAGTCATATTATTGTATGAGACAATTTAA
+AGGCCTTGATTAAATGACTACAGGCTGGTTGTCCAGACTGACTACTTGGATAGAATATTAAAGTATAACCAT
+CTGGATCCTTGAGACATTGATGGGCACTGTGTAGTAGAGTAACAGTTGGTTTTTCAGTAACCTCATGATAGG
+TTTGGTTATATTATGTCACAAGGTTTACTTTGAGAAAAACACAAAATGGTGTTTTCAACTGATTAAGAATTG
+AGAACTTCAGTGGCTGTATTACCTCAGGTCCAATTACATAGCAATTAATACTAACCAGATAATTAAATGAGT
+ATAAACTTTGGAATAATTGATACCTGGGTTCAGATTTGGCTTTTGCCATGTATTGGCTATGCAAATCTATCA
+GGTTATTTACACTCATTCAGCATCAATTTCCTCATTTGCGAATTAAATACAGTAATTAAGTCTATCTGTAGT
+ATTATGACGATTAATGATATTTAAATATACAAAATGCCCAACATAAAGTATAATACACATCAGATTCATCTT
+AAATAGTATTATAATTACTTTTACATTGTATTAAAACACCTTAAAAAGCAAGTAACATCTAGAAATCTCATC
+GAGAAACTCTTTCTTGACAACTATTGCAGGAAACAGGGGAGGTAATATTTACCATGTTGGATTTCACATACT
+TAAACTGTGTACCCTCATTTGTAAACTTTCATTGTACACACTACTATTAGATTAGATCAATAGCCAAAGGAA
+GTATCTTGATGGAGATCATCGAAAGAGCTGTTTTAACACATTTCAGGGTACTTATGATAGGAAAAGTATTTT
+TCTTATTTATGTAAAGACCCTGTAAACAATCATTTTACACACACCCAAGAAAATAATTCCACTCACCTTAAG
+AAAATAAAACAAACAAAAAAATAATAACTATAGGGCATTCTGATTTAGGAGTGGTCTAGAACTGCAAGAGTT
+TCCTCTTTGAAGTCCAACTTCATTCCTTAGCAGCACAGATGAGACCAAGGAAGTCTAGCTCTAACAATAAGT
+CTGGCTGTGAGTAAACTCAAACATCTAAGTATTTTGGATAAAAGACTCAAATCATTTTAATTGTTTTATAAT
+TCAAGTAAAATATGTCAGATGATAATTTTGAGGATTGTTATTTTATTACATCATTCTAAACTCAAGACTAAT
+GGAATTACAATCAGGGTTTGTTTTTACTAAAACCATATGGGGGTGATCTTAAAATAAAGAAGACTTATTTAG
+TGCAACCACATTTGGATAATAAAATTAGCTCCAGTTATGTTATCATGATTTACATGTGTGCCTCATGTCACT
+TGCTTTGGGAAAAAAGGTGAAAATTTTCATGACATTTTCAGTTTCATTATAAACCCAAGAGACATTATGAAT
+GTATGTGCATAGATTCATCTAAAAATTGAGAAATTTGTGATTCCTTCTTGTACATAAATAAATTTTTATGTT
+ATACATAGCTGTTATCTAGAGAGAAGCTGTTTGTTGTTTTTGTTACTAGTGGAAAAATGCTAATTTACAATT
+CCCACTTAAATAATGCTATTCTTTTTTTAATACTAGTGTTTGACAGCTGTCATCTTCTATTATTTGTAAATT
+CCAATTAAGTCAACATACATGCTTCACACCCATCATGTAGGACAGTGTAATCTGAGGATGCATTTTTAAGCT
+GGCAAGATAATGCTTTTTCCCTCTGGCATCAAAGGCAATAAATAGCATCTCTCTAGATAATACAAATAAAGA
+GATATCTATCATTAATTTATACAGTGAAAAAGCATATCTCGTTGCCTAAAATATTTGGCAAAAAATTCAAAA
+TAATATGTTCAATTTTTTCCTAAAATGTCTAGTTACTCCAAGTAACAGTAAGACTGTATGCTAACATAACTT
+CACTGAAAGCAGATGACATGAATTGGGTTATCAGTACCTTTGACTCCATCTCAGTTGGATCCTATATGTTGA
+GCACCTAGAGAAAGAGCACCTTTCACATACCTTTGTCACTGATATTTGATAATTTACCATTACATTTAATGC
+TTTAACTTTGGGCGTCTTCCTCCTTATGTGACCTTTCTAAAAATTTAAACATTTTGTGAAGAGATGAAACAT
+TAAGTCTTATGGCCAGATGTAGATGAGTTCATCAGCTGGTATATGCTATGATGATGCAATCCTGAGAACTTT
+ATAAAGGAGAAGACTCTGAAACAAGGAAACATACGGAGATGCTGAGAGATACTACCTCTTTCTCTTTGAACT
+TAAGGCTTGTCATTCATGACAACACTCATCCCCCCAGCAAACCAGGCTCCTGCCTAGGCTCAAATCTTTGTG
+AATATTCAAAGAACCTCTCTTATCCCAGTATCTGCTAGGGTTTTTTTCACTCTTCTAATAAAAGAGATTTAT
+AAAAAATATTCGAGCAAGATAGCAATTGAAGCAACATTAATGAATCTTCCCATTTTAAATCCTGAAACCAAA
+AATGTTTAAATGTTTAATTTTTTTTTTGAAATTGGAGAAAAAAGCCTTTGGGAGGTGATGGGATCATAGATA
+TTTGTATATGCCCAAATTCATCATTTGTACACATTAAATATGTGCAGTTCTTTTTATAGCAACTATACCTCA
+ATAAAGTTGTTTTTTTAAAAAAATCATGTTAGGTGGGTAGTAGGTAAGTAGCCATAGAAATATGTGGTGTTC
+ATTCCAGAACTACTTGAAATCTAACAATTTTTATTTAGAAATTCTCAGGGAAAACTAAGCTTTAACAACTAG
+TGGATCAAGCCAGAAGTTGGAGAGGAAACTGAGAATGAAACAGAAAGTATCACAGAACAATCATCACCAGAA
+AAATACAAAACAAATGACTTCCCAAATCCCTACCCCAGAATGCAAGGGCTATACTTATTTTCACATTGACAA
+CTGATTTATCCCAAGGTGGGACCTTAAGACACTTATAGTGACTCAATGGTTTATGTGCTTTCTTTTTCCCCT
+TTTAACAGTGAAATGTGCCGTGAAAGACACCTATAGTTGCTTTATCATGAGAGGCAAATGTCGACATGAGTG
+TCATGATTTTGAAAAACCAATTGGTTTCTGCACAAAACTAAATGCCAACTGTTATATGTAGAAGTTTGAAAT
+AAAAATCAGCATCCCTCAAAAGTAAAGCACAGTGAGTACAAGGATTAGTAATAAAAGAAACTGAATTAC
+>CDS|ENST00000398721/1-183 DEFB133-201
+ATGAAGATTCACGTCTTTCTCTTTGTTTTATTTTTCTTTCTGGTCCCCATTGCCACCAGGGTGAAATGTGCC
+GTGAAAGACACCTATAGTTGCTTTATCATGAGAGGCAAATGTCGACATGAGTGTCATGATTTTGAAAAACCA
+ATTGGTTTCTGCACAAAACTAAATGCCAACTGTTATATG
diff --git a/examples/testdata/test_coding2genome_showtargetgff.gff2 b/examples/testdata/test_coding2genome_showtargetgff.gff2
new file mode 100644 (file)
index 0000000..ebb3422
--- /dev/null
@@ -0,0 +1,82 @@
+##gff-version 2
+##source-version exonerate:coding2genome 2.4.0
+##date 2021-02-03
+##type DNA
+#
+#
+# seqname source feature start end score strand frame attributes
+#
+ENSG00000214643/1-4245 exonerate:coding2genome gene    822     4162    322     +       .       gene_id 0 ; sequence CDS|ENST00000398721/1-183 ; gene_orientation + ; identity 100.00 ; similarity 100.00
+ENSG00000214643/1-4245 exonerate:coding2genome cds     822     880     .       +       .       
+ENSG00000214643/1-4245 exonerate:coding2genome exon    822     880     .       +       .       insertions 0 ; deletions 0 ; identity 100.00 ; similarity 100.00
+ENSG00000214643/1-4245 exonerate:coding2genome splice5 881     882     .       +       .       intron_id 1 ; splice_site "GT"
+ENSG00000214643/1-4245 exonerate:coding2genome intron  881     4038    .       +       .       intron_id 1
+ENSG00000214643/1-4245 exonerate:coding2genome splice3 4037    4038    .       +       .       intron_id 0 ; splice_site "AC"
+ENSG00000214643/1-4245 exonerate:coding2genome cds     4039    4162    .       +       .       
+ENSG00000214643/1-4245 exonerate:coding2genome exon    4039    4162    .       +       .       insertions 0 ; deletions 0 ; identity 100.00 ; similarity 100.00
+ENSG00000214643/1-4245 exonerate:coding2genome similarity      822     4162    322     +       .       alignment_id 0 ; Query CDS|ENST00000398721/1-183 ; Align 822 1 57 ; Align 4040 61 123
+##FASTA
+>ENSG00000214643/1-4245 chromosome:GRCh38:6:49946021:49950265:-1
+CTCCTGCCTCAGCCTCTGGGAGTAGCTGGGACTACAGGGGCCCGCCACCACGCCCGGATAATTTTTTTCTAT
+TTTTTAGTAGAGACGGGGTTTCACCGTGTTAGCCGGGACGGTCTTAATCTCCTGACCTAGTTATCCGCCAGC
+CTCGGCCTCCCAAAGTGCTGGAATTACGGGTGTGAGTATATTGCTTTTTAAATTCACTAGTTTATTCATTAT
+GTATAGCTATTTAAAAAAGAGAAAAACTGTCCTGGCTAACACGGTGAAACCCCGTTTCTACTAAAAATACAA
+AAAATTAGCCGGCCTGGTGGCGGGTGCCTGTAGTACCACCTACTCTGGAGGCTGAGGCAGGAGAATGGCGTG
+AACCCGGGAGGCAGAGCTTGCAGTGAGTCGCGATCGCGCCACTGAACTCCAGCCTGGGTGACAGAACGAGAC
+TCCGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAGAGAGAGAGAGAGAGAGAGAGAGAAACTAAATCAAAAG
+CAAAGATTAAATTCTCATGCTTCTACTTTTCAGACCTTCAATAGGAGATTCTCAGTGGCATGCACTTCCTAG
+ACAACCAGTTATTAAAAAGATAATTATATACTATGGATATCATAACCCTTTATATCCCAAGATAAAAGAAAA
+GTAATGTCCTGCATTTTTGCCCCAGGCACTGCTAAACAGGAGTGTTAACTTTTGATAAACCCCAGTTTCTAA
+ATTCGTACAACTGCATGGAGAGAAGTATAAAAAGATGTGCTCTCCACCTTTCTCTTATTCAAACACTGCAGA
+GCTGCTAATTCGGCTGACTGTCTTCAATCATGAAGATTCACGTCTTTCTCTTTGTTTTATTTTTCTTTCTGG
+TCCCCATTGCCACCAGGTAAAATGTAAATATTGAGAGGGTAAAATATCTTTCTGGAGAGTTGCATTTTGGTC
+GATGTCAACTGGAGACACAAGAGTGACCAGATTGTTAAGAGAGTGGTTGGTTACGGTCCCAAGAGTAAATCA
+CAAATAAAGGACTGTGGTCTTACTAAACAGTAAGGCTCCGGGTGAGTCATATTATTGTATGAGACAATTTAA
+AGGCCTTGATTAAATGACTACAGGCTGGTTGTCCAGACTGACTACTTGGATAGAATATTAAAGTATAACCAT
+CTGGATCCTTGAGACATTGATGGGCACTGTGTAGTAGAGTAACAGTTGGTTTTTCAGTAACCTCATGATAGG
+TTTGGTTATATTATGTCACAAGGTTTACTTTGAGAAAAACACAAAATGGTGTTTTCAACTGATTAAGAATTG
+AGAACTTCAGTGGCTGTATTACCTCAGGTCCAATTACATAGCAATTAATACTAACCAGATAATTAAATGAGT
+ATAAACTTTGGAATAATTGATACCTGGGTTCAGATTTGGCTTTTGCCATGTATTGGCTATGCAAATCTATCA
+GGTTATTTACACTCATTCAGCATCAATTTCCTCATTTGCGAATTAAATACAGTAATTAAGTCTATCTGTAGT
+ATTATGACGATTAATGATATTTAAATATACAAAATGCCCAACATAAAGTATAATACACATCAGATTCATCTT
+AAATAGTATTATAATTACTTTTACATTGTATTAAAACACCTTAAAAAGCAAGTAACATCTAGAAATCTCATC
+GAGAAACTCTTTCTTGACAACTATTGCAGGAAACAGGGGAGGTAATATTTACCATGTTGGATTTCACATACT
+TAAACTGTGTACCCTCATTTGTAAACTTTCATTGTACACACTACTATTAGATTAGATCAATAGCCAAAGGAA
+GTATCTTGATGGAGATCATCGAAAGAGCTGTTTTAACACATTTCAGGGTACTTATGATAGGAAAAGTATTTT
+TCTTATTTATGTAAAGACCCTGTAAACAATCATTTTACACACACCCAAGAAAATAATTCCACTCACCTTAAG
+AAAATAAAACAAACAAAAAAATAATAACTATAGGGCATTCTGATTTAGGAGTGGTCTAGAACTGCAAGAGTT
+TCCTCTTTGAAGTCCAACTTCATTCCTTAGCAGCACAGATGAGACCAAGGAAGTCTAGCTCTAACAATAAGT
+CTGGCTGTGAGTAAACTCAAACATCTAAGTATTTTGGATAAAAGACTCAAATCATTTTAATTGTTTTATAAT
+TCAAGTAAAATATGTCAGATGATAATTTTGAGGATTGTTATTTTATTACATCATTCTAAACTCAAGACTAAT
+GGAATTACAATCAGGGTTTGTTTTTACTAAAACCATATGGGGGTGATCTTAAAATAAAGAAGACTTATTTAG
+TGCAACCACATTTGGATAATAAAATTAGCTCCAGTTATGTTATCATGATTTACATGTGTGCCTCATGTCACT
+TGCTTTGGGAAAAAAGGTGAAAATTTTCATGACATTTTCAGTTTCATTATAAACCCAAGAGACATTATGAAT
+GTATGTGCATAGATTCATCTAAAAATTGAGAAATTTGTGATTCCTTCTTGTACATAAATAAATTTTTATGTT
+ATACATAGCTGTTATCTAGAGAGAAGCTGTTTGTTGTTTTTGTTACTAGTGGAAAAATGCTAATTTACAATT
+CCCACTTAAATAATGCTATTCTTTTTTTAATACTAGTGTTTGACAGCTGTCATCTTCTATTATTTGTAAATT
+CCAATTAAGTCAACATACATGCTTCACACCCATCATGTAGGACAGTGTAATCTGAGGATGCATTTTTAAGCT
+GGCAAGATAATGCTTTTTCCCTCTGGCATCAAAGGCAATAAATAGCATCTCTCTAGATAATACAAATAAAGA
+GATATCTATCATTAATTTATACAGTGAAAAAGCATATCTCGTTGCCTAAAATATTTGGCAAAAAATTCAAAA
+TAATATGTTCAATTTTTTCCTAAAATGTCTAGTTACTCCAAGTAACAGTAAGACTGTATGCTAACATAACTT
+CACTGAAAGCAGATGACATGAATTGGGTTATCAGTACCTTTGACTCCATCTCAGTTGGATCCTATATGTTGA
+GCACCTAGAGAAAGAGCACCTTTCACATACCTTTGTCACTGATATTTGATAATTTACCATTACATTTAATGC
+TTTAACTTTGGGCGTCTTCCTCCTTATGTGACCTTTCTAAAAATTTAAACATTTTGTGAAGAGATGAAACAT
+TAAGTCTTATGGCCAGATGTAGATGAGTTCATCAGCTGGTATATGCTATGATGATGCAATCCTGAGAACTTT
+ATAAAGGAGAAGACTCTGAAACAAGGAAACATACGGAGATGCTGAGAGATACTACCTCTTTCTCTTTGAACT
+TAAGGCTTGTCATTCATGACAACACTCATCCCCCCAGCAAACCAGGCTCCTGCCTAGGCTCAAATCTTTGTG
+AATATTCAAAGAACCTCTCTTATCCCAGTATCTGCTAGGGTTTTTTTCACTCTTCTAATAAAAGAGATTTAT
+AAAAAATATTCGAGCAAGATAGCAATTGAAGCAACATTAATGAATCTTCCCATTTTAAATCCTGAAACCAAA
+AATGTTTAAATGTTTAATTTTTTTTTTGAAATTGGAGAAAAAAGCCTTTGGGAGGTGATGGGATCATAGATA
+TTTGTATATGCCCAAATTCATCATTTGTACACATTAAATATGTGCAGTTCTTTTTATAGCAACTATACCTCA
+ATAAAGTTGTTTTTTTAAAAAAATCATGTTAGGTGGGTAGTAGGTAAGTAGCCATAGAAATATGTGGTGTTC
+ATTCCAGAACTACTTGAAATCTAACAATTTTTATTTAGAAATTCTCAGGGAAAACTAAGCTTTAACAACTAG
+TGGATCAAGCCAGAAGTTGGAGAGGAAACTGAGAATGAAACAGAAAGTATCACAGAACAATCATCACCAGAA
+AAATACAAAACAAATGACTTCCCAAATCCCTACCCCAGAATGCAAGGGCTATACTTATTTTCACATTGACAA
+CTGATTTATCCCAAGGTGGGACCTTAAGACACTTATAGTGACTCAATGGTTTATGTGCTTTCTTTTTCCCCT
+TTTAACAGTGAAATGTGCCGTGAAAGACACCTATAGTTGCTTTATCATGAGAGGCAAATGTCGACATGAGTG
+TCATGATTTTGAAAAACCAATTGGTTTCTGCACAAAACTAAATGCCAACTGTTATATGTAGAAGTTTGAAAT
+AAAAATCAGCATCCCTCAAAAGTAAAGCACAGTGAGTACAAGGATTAGTAATAAAAGAAACTGAATTAC
+>CDS|ENST00000398721/1-183 DEFB133-201
+ATGAAGATTCACGTCTTTCTCTTTGTTTTATTTTTCTTTCTGGTCCCCATTGCCACCAGGGTGAAATGTGCC
+GTGAAAGACACCTATAGTTGCTTTATCATGAGAGGCAAATGTCGACATGAGTGTCATGATTTTGAAAAACCA
+ATTGGTTTCTGCACAAAACTAAATGCCAACTGTTATATG
index 9ce4cc6..4973010 100644 (file)
@@ -30,6 +30,7 @@ import jalview.datamodel.SequenceI;
 import jalview.util.MapList;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 
@@ -156,22 +157,15 @@ public class ExonerateHelper extends Gff2Helper
     }
 
     /*
-     * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; 
+     * similarity start and end can tell us 
+     * which part of the alignment refers to which sequence
      */
-    SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
-            relaxedIdMatching);
-
-    /*
-     * If mapping is from protein to dna, we store it as dna to protein instead
-     */
-    SequenceI mapFromSequence = seq;
-    SequenceI mapToSequence = mappedSequence;
-    if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
-            || (type == MappingType.PeptideToNucleotide
-                    && !featureIsOnTarget))
-    {
-      mapFromSequence = mappedSequence;
-      mapToSequence = seq;
+    int similarityFrom,similarityTo;
+    try {
+      similarityFrom = Integer.parseInt(gff[START_COL]);
+      similarityTo = Integer.parseInt(gff[END_COL]);
+    } catch (Exception x) {
+      throw new IOException("Couldn't parse start/end of the similarity feature",x); 
     }
 
     /*
@@ -182,13 +176,6 @@ public class ExonerateHelper extends Gff2Helper
      */
 
     /*
-     * get any existing mapping for these sequences (or start one),
-     * and add this mapped range
-     */
-    AlignedCodonFrame acf = getMapping(align, mapFromSequence,
-            mapToSequence);
-
-    /*
      * exonerate GFF has the strand of the target in column 7
      * (differs from GFF3 which has it in the Target descriptor)
      */
@@ -205,6 +192,8 @@ public class ExonerateHelper extends Gff2Helper
     }
 
     List<String> alignedRegions = set.get(ALIGN);
+    List<MapList> mappings = new ArrayList<MapList>();
+    int fromLowest=0, fromHighest=0, toLowest=0, toHighest=0;
     for (String region : alignedRegions)
     {
       MapList mapping = buildMapping(region, type, forwardStrand,
@@ -215,6 +204,133 @@ public class ExonerateHelper extends Gff2Helper
         continue;
       }
 
+      /*
+       * record total extent of aligned region(s) for later
+       */
+      if (mappings.size() == 0)
+      {
+        if (mapping.getFromLowest() < mapping.getFromHighest())
+        {
+          fromLowest = mapping.getFromLowest();
+          fromHighest = mapping.getFromHighest();
+        }
+        else
+        {
+          fromLowest = mapping.getFromHighest();
+          fromHighest = mapping.getFromLowest();
+        }
+        if (mapping.getToLowest() < mapping.getToHighest())
+        {
+          toLowest = mapping.getToLowest();
+          toHighest = mapping.getToHighest();
+        }
+        else
+        {
+          toLowest = mapping.getToHighest();
+          toHighest = mapping.getToLowest();
+        }
+      }
+      else
+      {
+        int fl = mapping.getFromLowest(), fh = mapping.getFromHighest(),
+                tl = mapping.getToLowest(), th = mapping.getToHighest();
+        if (fl > fh)
+        {
+          fl = fh;
+          fh = mapping.getFromLowest();
+        }
+        if (tl > th)
+        {
+          tl = th;
+          th = mapping.getToLowest();
+        }
+        if (fromLowest > fl)
+
+        {
+          fromLowest = fl;
+        }
+        if (fromHighest < fh)
+        {
+          fromHighest = fh;
+        }
+        if (toLowest > tl)
+        {
+          toLowest = tl;
+        }
+        if (toHighest < th)
+        {
+          toHighest = th;
+        }
+      }
+      mappings.add(mapping);
+    }
+    
+    /*
+     * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; 
+     */
+    SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs,
+            relaxedIdMatching);
+    
+    /*
+     * finally, resolve the sense of the mapping 
+     */
+    SequenceI mapFromSequence = seq;
+    SequenceI mapToSequence = mappedSequence;
+
+    /*
+     * If mapping is from protein to dna, we store it as dna to protein instead
+     */
+    if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget)
+            || (type == MappingType.PeptideToNucleotide
+                    && !featureIsOnTarget))
+    {
+      mapFromSequence = mappedSequence;
+      mapToSequence = seq;
+    }
+    /*
+     * the sense of 'align' mappings for nucleotide alignments
+     * from exonerate seems to be ambiguous, so we need to do a bit more work
+     */
+    if (type == MappingType.NucleotideToNucleotide || type == MappingType.PeptideToPeptide)
+    {
+      /*
+       *  then check whether the aligned region is contained 
+       *  by the feature to determine sense of mapping
+       */
+      if (fromHighest==toHighest && fromLowest==toLowest)
+      {
+        // ambiguous case - for simple alignments this doesn't matter, but it is important for rearrangements or inversions
+        if (featureIsOnTarget)
+        {
+          // TODO: raise a warning since we don't have test coverage for this case
+          mapFromSequence=mappedSequence; // Target sequence 
+          mapToSequence=seq; // annotated sequence
+        }
+      } else if (similarityFrom == fromLowest && similarityTo == fromHighest)
+      {
+        mapFromSequence = seq;
+        mapToSequence = mappedSequence;
+      }
+      else if (similarityFrom == toLowest && similarityTo == toHighest)
+      {
+        mapFromSequence = mappedSequence;
+        mapToSequence = seq;
+      }
+      else
+      {
+        throw new IOException(
+                "Couldn't determine sense for similarity feature");
+      }
+    }
+    
+    /*
+     * get any existing mapping for these sequences (or start one),
+     * and add this mapped range
+     */
+    AlignedCodonFrame acf = getMapping(align, mapFromSequence,
+            mapToSequence);
+    for (MapList mapping : mappings)
+    {
       acf.addMap(mapFromSequence, mapToSequence, mapping);
     }
     align.addCodonFrame(acf);
index b753e94..0975750 100644 (file)
@@ -242,6 +242,48 @@ public class FeaturesFileTest
     assertEquals(2f, sf.getScore(), 0.001f);
   }
 
+  @Test(groups = { "Functional" })
+  public void testImportGFF2ExonerateCDSAndCoding2Genome()
+          throws IOException
+  {
+    /*
+     * Imports each pair of exonerate GFF2 files (query gff, then target gff) and
+     * checks that sequence 1 is gapped and identical in both imports; assumes sequence 1
+     * is a transcript shorter than, and aligned to exons on, the locus (sequence 0).
+     * The test files were generated as follows, where mode was query or target:
+     * exonerate --showvulgar false --showalignment false --show${mode}gff ... > test_${mode}.gff2
+     * echo '##FASTA' >> test_${mode}.gff2
+     * cat example_Locus.fa example_CDS.fa >> test_${mode}.gff2  
+     * [ then remove the output before the gff-version 2 header and the exonerate lines after the gff dump ]
+     */
+    String[][] testFiles = new String[][] {
+        { "test_cdna2genome_showquerygff.gff2",
+            "test_cdna2genome_showtargetgff.gff2" },
+        { "test_coding2genome_showquerygff.gff2",
+            "test_coding2genome_showtargetgff.gff2" } };
+
+    for (String[] testfilepair : testFiles)
+    {
+      FormatAdapter fa = new FormatAdapter();
+      AlignmentI al = fa.readFile("examples/testdata/" + testfilepair[0],
+              DataSourceType.FILE, FileFormat.Features);
+      
+      assertEquals(2, al.getHeight());
+      // sequence 1 (imported from the query gff) must contain at least one gap character
+      assertTrue(al.getSequenceAt(1).getSequenceAsString().contains(""+al.getGapCharacter()));
+      assertTrue(al.isAligned());
+      
+      AlignmentI al2 = fa.readFile("examples/testdata/" + testfilepair[1],
+              DataSourceType.FILE, FileFormat.Features);
+      
+      assertEquals(2, al2.getHeight());
+      assertTrue(al2.isAligned());
+      // sequence 1 must be identical whether imported from the query gff or the target gff
+      assertEquals(al.getSequenceAt(1).getSequenceAsString(),
+              al2.getSequenceAt(1).getSequenceAsString());
+    }
+  }
+  
   public static AlignmentI readAlignmentFile(File f) throws IOException
   {
     System.out.println("Reading file: " + f);