From: gmungoc Date: Thu, 8 Oct 2015 08:57:17 +0000 (+0100) Subject: JAL-1499 convert unsupported gap character to '-' X-Git-Url: http://source.jalview.org/gitweb/?a=commitdiff_plain;h=7fd7bf631f5132771260baa81a957457ceb89694;p=jalview.git JAL-1499 convert unsupported gap character to '-' --- diff --git a/src/jalview/io/MegaFile.java b/src/jalview/io/MegaFile.java index 3096b60..97d7775 100644 --- a/src/jalview/io/MegaFile.java +++ b/src/jalview/io/MegaFile.java @@ -24,6 +24,7 @@ import jalview.datamodel.Annotation; import jalview.datamodel.Sequence; import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; +import jalview.util.Comparison; import java.io.IOException; import java.util.ArrayList; @@ -143,6 +144,8 @@ public class MegaFile extends AlignFile private static final String TAB = "\t"; + private static final char DEFAULT_GAP = '-'; + /* * number of sequence positions output per line */ @@ -151,7 +154,7 @@ public class MegaFile extends AlignFile private String title; // gap character may be explicitly declared, default is - - private char gapCharacter = '-'; + private char gapCharacter = DEFAULT_GAP; // identity character if declared private char identityCharacter = 0; @@ -219,7 +222,7 @@ public class MegaFile extends AlignFile @Override public void parse() throws IOException { - gapCharacter = '-'; + gapCharacter = DEFAULT_GAP; sequenceFeatures = new HashMap>(); geneStart = new HashMap(); domainStart = new HashMap(); @@ -954,18 +957,22 @@ public class MegaFile extends AlignFile for (int i = 0; i < formatted.length(); i++) { char nextChar = formatted.charAt(i); - if (nextChar != gapCharacter) - { - nonGapped++; - } - if (nextChar == identityCharacter - && len + i < referenceSequence.length()) + if (nextChar == gapCharacter) { - sb1.append(referenceSequence.charAt(len + i)); + sb1.append(Comparison.isGap(nextChar) ? 
nextChar : DEFAULT_GAP); } else { - sb1.append(nextChar); + nonGapped++; + if (nextChar == identityCharacter + && len + i < referenceSequence.length()) + { + sb1.append(referenceSequence.charAt(len + i)); + } + else + { + sb1.append(nextChar); + } } } formatted = sb1.toString(); @@ -1179,6 +1186,11 @@ public class MegaFile extends AlignFile else if (keyword.equalsIgnoreCase(INDEL)) { this.gapCharacter = value.charAt(0); + if (!Comparison.isGap(gapCharacter)) + { + System.err.println("Jalview doesn't support '" + gapCharacter + + "' for gaps, will be converted to '" + DEFAULT_GAP + "'"); + } } else if (keyword.equalsIgnoreCase(IDENTICAL) @@ -1604,9 +1616,14 @@ public class MegaFile extends AlignFile public void addProperties(AlignmentI al) { super.addProperties(al); - al.setGapCharacter(gapCharacter); /* + * record gap character specified, but convert to '-' if not one we support + */ + al.setGapCharacter(Comparison.isGap(gapCharacter) ? gapCharacter + : DEFAULT_GAP); + + /* * warn if e.g. 
DataType=DNA but data is protein (or vice versa) */ if (this.nucleotide != null && this.nucleotide != al.isNucleotide()) { diff --git a/test/jalview/io/MegaFileTest.java b/test/jalview/io/MegaFileTest.java index 2b2422f..f7d83c0 100644 --- a/test/jalview/io/MegaFileTest.java +++ b/test/jalview/io/MegaFileTest.java @@ -13,6 +13,7 @@ import jalview.datamodel.SequenceFeature; import jalview.datamodel.SequenceI; import java.io.IOException; +import java.util.List; import java.util.Vector; import org.testng.annotations.Test; @@ -777,4 +778,35 @@ public class MegaFileTest verifySequenceFeature(sfs[2], "gene2", "Gene", 7, 12); } } + + //@formatter:on + + /** + * Test case where the declared gap character is one Jalview does not support; + * it should be converted to a '-' + * + * @throws IOException + */ + @Test(groups = { "Functional" }) + public void testParse_weirdGapCharacter() throws IOException + { + //@formatter:off + String data = "#MEGA\n"+ + "!TITLE Interleaved sequence data;\n" + + "!Format Identical=. Indel=%;\n\n" + + "#U455 %BC%EF\n" + + "#CPZANT M..P.R\n\n" + + "#U455 KLMNOP\n" + + "#CPZANT .%%Z.."; + AppletFormatAdapter fa = new AppletFormatAdapter(); + AlignmentI al = fa.readFile(data, + AppletFormatAdapter.PASTE, "MEGA"); + //@formatter:on + List<SequenceI> seqs = al.getSequences(); + assertEquals("First sequence data wrong", "-BC-EFKLMNOP", seqs.get(0) + .getSequenceAsString()); + assertEquals("Second sequence data wrong", "MBCPERK--ZOP", seqs.get(1) + .getSequenceAsString()); + assertEquals('-', al.getGapCharacter()); + } }