From dae56c38c3f14e96308540c30f35ca8f1d917edf Mon Sep 17 00:00:00 2001 From: Jim Procter Date: Wed, 3 Feb 2021 16:34:45 +0000 Subject: [PATCH] =?utf8?q?JAL-3808=20handle=201:1=20mappings=20in=20exonerat?= =?utf8?q?e=20gff2=20using=20the=20=E2=80=98similarity=E2=80=99=20feature=20?= =?utf8?q?extent=20to=20determine=20direction=20of=20mapping=20generated=20f?= =?utf8?q?rom=20alignment?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- src/jalview/io/gff/ExonerateHelper.java | 160 ++++++++++++++++++++++++++----- 1 file changed, 138 insertions(+), 22 deletions(-) diff --git a/src/jalview/io/gff/ExonerateHelper.java b/src/jalview/io/gff/ExonerateHelper.java index da0c245..ab39ce9 100644 --- a/src/jalview/io/gff/ExonerateHelper.java +++ b/src/jalview/io/gff/ExonerateHelper.java @@ -28,6 +28,7 @@ import jalview.datamodel.SequenceI; import jalview.util.MapList; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -154,22 +155,15 @@ public class ExonerateHelper extends Gff2Helper } /* - * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; + * similarity start and end can tell us + * which part of the alignment refers to which sequence */ - SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs, - relaxedIdMatching); - - /* - * If mapping is from protein to dna, we store it as dna to protein instead - */ - SequenceI mapFromSequence = seq; - SequenceI mapToSequence = mappedSequence; - if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget) - || (type == MappingType.PeptideToNucleotide - && !featureIsOnTarget)) - { - mapFromSequence = mappedSequence; - mapToSequence = seq; + int similarityFrom,similarityTo; + try { + similarityFrom = Integer.parseInt(gff[START_COL]); + similarityTo = Integer.parseInt(gff[END_COL]); + } catch (Exception x) { + throw new IOException("Couldn't parse start/end of the similarity feature",x); } /* @@ -180,13 +174,6 @@ public class ExonerateHelper extends Gff2Helper */ /* - * get any existing mapping for these sequences (or start one), - * and add this mapped range - */ - AlignedCodonFrame acf = getMapping(align, mapFromSequence, - mapToSequence); - - /* * exonerate GFF has the strand of the target in column 7 * (differs from GFF3 which has it in the Target descriptor) */ @@ -203,6 +190,8 @@ public class ExonerateHelper extends Gff2Helper } List alignedRegions = set.get(ALIGN); + List mappings = new ArrayList(); + int fromLowest=0, fromHighest=0, toLowest=0, toHighest=0; for (String region : alignedRegions) { MapList mapping = buildMapping(region, type, forwardStrand, @@ -213,6 +202,133 @@ public class ExonerateHelper extends Gff2Helper continue; } + /* + * record total extent of aligned region(s) for later + */ + if (mappings.size() == 0) + { + if (mapping.getFromLowest() < mapping.getFromHighest()) + { + fromLowest = mapping.getFromLowest(); + fromHighest = mapping.getFromHighest(); + } + else + { + fromLowest = mapping.getFromHighest(); + fromHighest = mapping.getFromLowest(); + } + if (mapping.getToLowest() < mapping.getToHighest()) + { + toLowest = mapping.getToLowest(); + toHighest = mapping.getToHighest(); + } + else + { + toLowest = mapping.getToHighest(); + toHighest = mapping.getToLowest(); + } + } + else + { + int fl = mapping.getFromLowest(), fh = mapping.getFromHighest(), + tl = mapping.getToLowest(), th = mapping.getToHighest(); + if (fl > fh) + { + fl = fh; + fh = mapping.getFromLowest(); + } + if (tl > th) + { + tl = th; + th = mapping.getToLowest(); + } + if (fromLowest > fl) + + { + fromLowest = fl; + } + if (fromHighest < fh) + { + fromHighest = fh; + } + if (toLowest > tl) + { + toLowest = tl; + } + if (toHighest < th) + { + toHighest = th; + } + } + mappings.add(mapping); + } + + /* + * locate the mapped sequence in the alignment or 'new' (GFF file) sequences; + */ + SequenceI mappedSequence = findSequence(mapTo.get(0), align, newseqs, + relaxedIdMatching); + + /* + * finally, resolve the sense of the mapping + */ + SequenceI mapFromSequence = seq; + SequenceI mapToSequence = mappedSequence; + + /* + * If mapping is from protein to dna, we store it as dna to protein instead + */ + if ((type == MappingType.NucleotideToPeptide && featureIsOnTarget) + || (type == MappingType.PeptideToNucleotide + && !featureIsOnTarget)) + { + mapFromSequence = mappedSequence; + mapToSequence = seq; + } + /* + * the sense of 'align' mappings for nucleotide alignments + * from exonerate seem to be ambiguous, so we need to do a bit more work + */ + if (type == MappingType.NucleotideToNucleotide || type == MappingType.PeptideToPeptide) + { + /* + * then check whether the aligned region is contained + * by the feature to determine sense of mapping + */ + if (fromHighest==toHighest && fromLowest==toLowest) + { + // ambiguous case - for simple alignments this doesn't matter, but important for rearrangements or inversions + if (featureIsOnTarget) + { + // TODO: raise a warning since we don't have test coverage for this case + mapFromSequence=mappedSequence; // Target sequence + mapToSequence=seq; // annotated sequence + } + } else if (similarityFrom == fromLowest && similarityTo == fromHighest) + { + mapFromSequence = seq; + mapToSequence = mappedSequence; + } + else if (similarityFrom == toLowest && similarityTo == toHighest) + { + mapFromSequence = mappedSequence; + mapToSequence = seq; + } + else + { + throw new IOException( + "Couldn't determine sense for similarity feature"); + } + } + + /* + * get any existing mapping for these sequences (or start one), + * and add this mapped range + */ + AlignedCodonFrame acf = getMapping(align, mapFromSequence, + mapToSequence); + for (MapList mapping : mappings) + { acf.addMap(mapFromSequence, mapToSequence, mapping); } align.addCodonFrame(acf); -- 1.7.10.2