package jalview.io;

import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertFalse;
import static org.testng.AssertJUnit.assertNull;
import static org.testng.AssertJUnit.assertTrue;
import static org.testng.AssertJUnit.fail;

import jalview.datamodel.AlignmentAnnotation;
import jalview.datamodel.AlignmentI;
import jalview.datamodel.Sequence;
import jalview.datamodel.SequenceFeature;
import jalview.datamodel.SequenceI;

import java.io.IOException;
import java.util.List;
import java.util.Vector;

import org.testng.annotations.Test;

/**
 * Unit tests for MegaFile - reading and writing data in MEGA format
 * (interleaved and non-interleaved).
 */
public class MegaFileTest
{
  // twenty characters (digits 9-0, then a-j) used to build long sequences
  private static final String TWENTY_CHARS = "9876543210abcdefghij";

  // thirty characters (digits 0-9, then k-t, then A-J) used to build long
  // sequences
  private static final String THIRTY_CHARS = "0123456789klmnopqrstABCDEFGHIJ";

  //@formatter:off
  // interleaved data: two blocks, each with one line per sequence; note the
  // title line here is written as "TITLE:" rather than "!TITLE"
  private static final String INTERLEAVED = 
          "#MEGA\n"+ 
          "TITLE: Interleaved sequence data\n\n" + 
          "#U455   ABCDEF\n" + 
          "#CPZANT  MNOPQR\n\n" + 
          "#U455   KLMNOP\n" + 
          "#CPZANT WXYZGC";

  // the same two interleaved blocks but without the #MEGA and title lines
  private static final String INTERLEAVED_NOHEADERS = 
          "#U455   ABCDEF\n" 
          + "#CPZANT MNOPQR\n\n" 
          + "#U455   KLMNOP\n"
          + "#CPZANT WXYZGC\n";

  // interleaved sequences, with 50 residues
  private static final String INTERLEAVED_50RESIDUES = 
          "#MEGA\n"
          + "!TITLE Interleaved sequence data\n\n"
          + "#U455 " + THIRTY_CHARS + TWENTY_CHARS + "\n" 
          + "#CPZANT " + TWENTY_CHARS + THIRTY_CHARS + "\n";

  // non-interleaved data: each sequence id is on its own line, followed by
  // all of that sequence's data lines
  private static final String NONINTERLEAVED = 
          "#MEGA\n"
          + "!TITLE Noninterleaved sequence data\n\n" 
          + "#U455  \n"
          + "ABCFEDHIJ\n" 
          + "MNOPQR\n\n" 
          + "#CPZANT \n" 
          + "KLMNOPWXYZ\n" 
          + "CGATC\n";
  
  // this one starts interleaved then switches to non-interleaved
  private static final String MIXED = 
          "#MEGA\n"
          + "!TITLE This is a mess\n\n" 
          + "#CPZANT KLMNOPWXYZCGATC\n\n"
          + "#U455\n  "
          + "ABCFEDHIJ\n";

  // interleaved with a new sequence appearing in the second block :-O
  private static final String INTERLEAVED_SEQUENCE_ERROR = 
          "#MEGA" + "\n"
          + "!TITLE Interleaved sequence data\n\n"
          + "#U455   ABCDEF\n" 
          + "#CPZANT  MNOPQR\n\n"
          + "#U456   KLMNOP\n";

  // interleaved with description, bases/gaps in triplet groups; the !Format
  // statement separates its entries with a mix of spaces and a tab
  private static final String INTERLEAVED_WITH_DESCRIPTION = 
          "#MEGA\n"
          + "!Title Data with description;\n"
          + "!Format DataType=DNA  indel=-\tCodeTable=Standard Missing=? MatchChar=.;\n\n"
          + "!Description\n" 
          + "    Line one of description\n"
          + "    Line two of description;\n\n"
          + "#U455   C-- GTA\n" 
          + "#CPZANT ATC -G-\n\n"
          + "#U455   CGA --T\n" 
          + "#CPZANT CA- -GC\n";

  //@formatter:on

  /**
   * Parses the two-block interleaved data and verifies the title, the ids and
   * order of the sequences, the residues joined across blocks, and the
   * interleaved flag.
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testParse_interleaved() throws IOException
  {
    MegaFile parsed = new MegaFile(INTERLEAVED, AppletFormatAdapter.PASTE);
    assertEquals("Title not as expected", "Interleaved sequence data",
            parsed.getAlignmentProperty(MegaFile.PROP_TITLE));

    List<SequenceI> sequences = parsed.getSeqs();
    assertEquals("Expected two sequences", 2, sequences.size());
    SequenceI first = sequences.get(0);
    SequenceI second = sequences.get(1);

    // names are expected in the same order as in the input
    assertEquals("First sequence id wrong", "U455", first.getName());
    assertEquals("Second sequence id wrong", "CPZANT", second.getName());

    // data from both blocks is concatenated for each sequence
    assertEquals("First sequence data wrong", "ABCDEFKLMNOP",
            first.getSequenceAsString());
    assertEquals("Second sequence data wrong", "MNOPQRWXYZGC",
            second.getSequenceAsString());

    assertTrue("File format is not flagged as interleaved",
            parsed.isInterleaved());
  }

  /**
   * Parses the non-interleaved data and verifies the title, the ids and order
   * of the sequences, the residues joined across data lines, and that the file
   * is not flagged as interleaved.
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testParse_nonInterleaved() throws IOException
  {
    MegaFile parsed = new MegaFile(NONINTERLEAVED,
            AppletFormatAdapter.PASTE);
    assertEquals("Title not as expected", "Noninterleaved sequence data",
            parsed.getAlignmentProperty(MegaFile.PROP_TITLE));

    List<SequenceI> sequences = parsed.getSeqs();
    assertEquals("Expected two sequences", 2, sequences.size());
    SequenceI first = sequences.get(0);
    SequenceI second = sequences.get(1);

    // names are expected in the same order as in the input
    assertEquals("First sequence id wrong", "U455", first.getName());
    assertEquals("Second sequence id wrong", "CPZANT", second.getName());

    // the data lines under each id are concatenated
    assertEquals("First sequence data wrong", "ABCFEDHIJMNOPQR",
            first.getSequenceAsString());
    assertEquals("Second sequence data wrong", "KLMNOPWXYZCGATC",
            second.getSequenceAsString());

    assertFalse("File format is not flagged as noninterleaved",
            parsed.isInterleaved());
  }

  /**
   * An interleaved file whose second block introduces a sequence id not seen
   * in the first block must be rejected with an IOException carrying the
   * expected parse error message.
   */
  @Test(groups = { "Functional" })
  public void testParse_interleavedExtraSequenceError()
  {
    IOException thrown = null;
    try
    {
      new MegaFile(INTERLEAVED_SEQUENCE_ERROR, AppletFormatAdapter.PASTE);
    } catch (IOException e)
    {
      thrown = e;
    }

    if (thrown == null)
    {
      fail("Expected extra sequence IOException");
    }
    assertEquals(
            "Unexpected exception message",
            "Parse error: misplaced new sequence starting at #U456   KLMNOP",
            thrown.getMessage());
  }

  /**
   * A file that starts in interleaved layout and then switches to
   * non-interleaved layout must be rejected with an IOException carrying the
   * expected parse error message.
   */
  @Test(groups = { "Functional" })
  public void testParse_mixedInterleavedNonInterleaved()
  {
    IOException thrown = null;
    try
    {
      new MegaFile(MIXED, AppletFormatAdapter.PASTE);
    } catch (IOException e)
    {
      thrown = e;
    }

    if (thrown == null)
    {
      fail("Expected mixed content exception");
    }
    assertEquals(
            "Unexpected exception message",
            "Parse error: interleaved was true but now seems to be false, at line: ABCFEDHIJ",
            thrown.getMessage());
  }

  /**
   * Checks that a sequence id is extracted from lines starting with '#', and
   * that null is returned for lines without a leading '#', for the empty
   * string and for null.
   */
  @Test(groups = { "Functional" })
  public void testGetSequenceId()
  {
    String[] linesWithId = { "#AB123 CGATC", "#AB123    CGATC",
        "#AB123 CGC TAC", "#AB123" };
    for (String line : linesWithId)
    {
      assertEquals("AB123", MegaFile.getSequenceId(line));
    }

    String[] linesWithoutId = { "AB123 CTAG", "AB123", "", null };
    for (String line : linesWithoutId)
    {
      assertNull(MegaFile.getSequenceId(line));
    }
  }

  /**
   * Checks that getMaxIdLength reports the length of the longest sequence
   * name in the array.
   */
  @Test(groups = { "Functional" })
  public void testGetMaxIdLength()
  {
    SequenceI[] pair = new Sequence[] { new Sequence("Something", "GCATAC"),
        new Sequence("SomethingElse", "GCATAC") };
    // "SomethingElse" is the longer name
    assertEquals(13, MegaFile.getMaxIdLength(pair));

    // swap in a shorter second name, so "Something" is now the longest
    pair[1] = new Sequence("DNA", "GCATAC");
    assertEquals(9, MegaFile.getMaxIdLength(pair));
  }

  /**
   * Checks that getMaxSequenceLength reports the length of the longest
   * sequence in the array.
   */
  @Test(groups = { "Functional" })
  public void testGetMaxSequenceLength()
  {
    SequenceI[] pair = new Sequence[] { new Sequence("Seq1", "GCATAC"),
        new Sequence("Seq2", "GCATACTAG") };
    // the second sequence (9 residues) is the longer one
    assertEquals(9, MegaFile.getMaxSequenceLength(pair));

    // shorten the second sequence, so the first (6 residues) is the longest
    pair[1] = new Sequence("Seq2", "GCA");
    assertEquals(6, MegaFile.getMaxSequenceLength(pair));
  }

  /**
   * Parses interleaved data and prints it back out, expecting the same two
   * blocks with a position-count comment appended to each line.
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testPrint_interleaved() throws IOException
  {
    // the input lines are short, so the output layout matches the input;
    // nb the Title is not output unless print(AlignmentI) is called
    //@formatter:off
    String expected =
            "#MEGA\n\n"
            + "#U455   ABCDEF [6]\n"
            + "#CPZANT MNOPQR [6]\n\n"
            + "#U455   KLMNOP [12]\n"
            + "#CPZANT WXYZGC [12]\n";
    //@formatter:on

    MegaFile parsed = new MegaFile(INTERLEAVED, AppletFormatAdapter.PASTE);
    String output = parsed.print();
    System.out.println(output);

    assertEquals("Print format wrong", expected, output);
  }

  /**
   * Parses interleaved data that has no #MEGA or title lines (this is
   * accepted) and checks the printed output, which does include the #MEGA
   * header.
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testPrint_interleavedNoHeaders() throws IOException
  {
    MegaFile parsed = new MegaFile(INTERLEAVED_NOHEADERS,
            AppletFormatAdapter.PASTE);
    String output = parsed.print();
    System.out.println(output);

    //@formatter:off
    String expected =
            "#MEGA\n\n"
            + "#U455   ABCDEF [6]\n"
            + "#CPZANT MNOPQR [6]\n\n"
            + "#U455   KLMNOP [12]\n"
            + "#CPZANT WXYZGC [12]\n";
    //@formatter:on
    assertEquals("Print format wrong", expected, output);
  }

  /**
   * Parses non-interleaved data and prints it back out. The parsed file
   * reports 10 positions per line, so each 15-residue sequence is printed as
   * a line of 10 followed by a line of 5.
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testPrint_noninterleaved() throws IOException
  {
    MegaFile parsed = new MegaFile(NONINTERLEAVED,
            AppletFormatAdapter.PASTE);
    assertEquals(10, parsed.getPositionsPerLine());

    String output = parsed.print();
    System.out.println(output);

    // output differs from input here because the input line lengths vary
    //@formatter:off
    String expected =
            "#MEGA\n\n"
            + "#U455\n"
            + "ABCFEDHIJM [10]\n"
            + "NOPQR [15]\n\n"
            + "#CPZANT\n"
            + "KLMNOPWXYZ [10]\n"
            + "CGATC [15]\n";
    //@formatter:on
    assertEquals("Print format wrong", expected, output);
  }

  /**
   * Parses 50-residue interleaved sequences, then prints them with 20
   * positions per line so that the output wraps over three blocks.
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testPrint_interleavedMultiLine() throws IOException
  {
    MegaFile parsed = new MegaFile(INTERLEAVED_50RESIDUES,
            AppletFormatAdapter.PASTE);
    assertEquals(50, parsed.getPositionsPerLine());

    // simulate the user choosing 20 residues per line for output
    parsed.setPositionsPerLine(20);
    String output = parsed.print();
    System.out.println(output);

    //@formatter:off
    String firstTwenty =
            "#U455   0123456789 klmnopqrst [20]\n"
            + "#CPZANT 9876543210 abcdefghij [20]\n\n";
    String nextTwenty =
            "#U455   ABCDEFGHIJ 9876543210 [40]\n"
            + "#CPZANT 0123456789 klmnopqrst [40]\n\n";
    String lastTen =
            "#U455   abcdefghij [50]\n"
            + "#CPZANT ABCDEFGHIJ [50]\n";
    //@formatter:on
    String expected = "#MEGA\n\n" + firstTwenty + nextTwenty + lastTen;
    assertEquals("Print format wrong", expected, output);
  }

  /**
   * Parses a single 60-residue non-interleaved sequence, then prints it after
   * requesting 25 positions per line; the expected output has 20 positions
   * per line.
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testPrint_noninterleavedMultiLine() throws IOException
  {
    // one data line of 30 residues followed by one of 30 (20 + 10)
    final String sixtyResidues = "#SIXTY\n" + THIRTY_CHARS + "\n"
            + TWENTY_CHARS + "9993332221\n";
    MegaFile parsed = new MegaFile(sixtyResidues, AppletFormatAdapter.PASTE);
    assertEquals(30, parsed.getPositionsPerLine());

    parsed.setPositionsPerLine(25);
    String output = parsed.print();

    /*
     * 25 positions per line is rounded down to 20 (two blocks of 10)
     */
    //@formatter:off
    String expected =
            "#MEGA\n\n"
            + "#SIXTY\n"
            + "0123456789 klmnopqrst [20]\n"
            + "ABCDEFGHIJ 9876543210 [40]\n"
            + "abcdefghij 9993332221 [60]\n";
    //@formatter:on
    assertEquals("Print format wrong", expected, output);
  }

  /**
   * Parses interleaved data that has !Title, !Format and a two-line
   * !Description, and verifies the title, the sequences (with triplet
   * spacing removed) and the description property.
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testParse_withDescription() throws IOException
  {
    MegaFile parsed = new MegaFile(INTERLEAVED_WITH_DESCRIPTION,
            AppletFormatAdapter.PASTE);
    assertEquals("Title not as expected", "Data with description",
            parsed.getAlignmentProperty(MegaFile.PROP_TITLE));

    List<SequenceI> sequences = parsed.getSeqs();
    assertEquals("Expected two sequences", 2, sequences.size());
    SequenceI first = sequences.get(0);
    SequenceI second = sequences.get(1);

    // names are expected in the same order as in the input
    assertEquals("First sequence id wrong", "U455", first.getName());
    assertEquals("Second sequence id wrong", "CPZANT", second.getName());

    // spaces between triplets are dropped, gaps are kept
    assertEquals("First sequence data wrong", "C--GTACGA--T",
            first.getSequenceAsString());
    assertEquals("Second sequence data wrong", "ATC-G-CA--GC",
            second.getSequenceAsString());
    assertTrue("File format is not flagged as interleaved",
            parsed.isInterleaved());

    // description keeps its line break and indentation, minus the final ';'
    String expectedDescription = "    Line one of description\n"
            + "    Line two of description";
    assertEquals("Description property not parsed", expectedDescription,
            parsed.getAlignmentProperty(MegaFile.PROP_DESCRIPTION));
  }

  /**
   * Checks extraction of the non-comment part of a line, where the second
   * argument is the comment nesting depth at the start of the line.
   * 
   * @throws FileFormatException
   *           if thrown by MegaFile.getNonCommentContent
   */
  @Test(groups = { "Functional" })
  public void testGetNonCommentContent() throws FileFormatException
  {
    // starting outside any comment
    final int outside = 0;
    assertEquals("abcde", MegaFile.getNonCommentContent("abcde", outside));
    assertEquals("CGT ACG GAC ",
            MegaFile.getNonCommentContent("CGT ACG GAC [9]", outside));

    // starting inside one level of comment
    final int inside = 1;
    assertEquals("", MegaFile.getNonCommentContent("abcde", inside));
    assertEquals(" abcde",
            MegaFile.getNonCommentContent("and others ] abcde", inside));
    assertEquals(" abcde", MegaFile.getNonCommentContent(
            "and others [including refs] ] abcde", inside));
    assertEquals(" x ] abcde",
            MegaFile.getNonCommentContent("and others ] x ] abcde", inside));
  }

  /**
   * Checks the comment nesting depth computed for a line, given the depth at
   * the start of the line as the second argument.
   * 
   * @throws FileFormatException
   *           if thrown by MegaFile.commentDepth
   */
  @Test(groups = { "Functional" })
  public void testCommentDepth() throws FileFormatException
  {
    // starting outside any comment
    final int outside = 0;
    assertEquals(0, MegaFile.commentDepth("abcde", outside));
    assertEquals(1, MegaFile.commentDepth("abc[de", outside));

    // starting inside one level of comment
    final int inside = 1;
    assertEquals(3, MegaFile.commentDepth("ab[c[de", inside));
    assertEquals(1, MegaFile.commentDepth("ab]c[d]e[f", inside));
    assertEquals(0, MegaFile.commentDepth("a]b[c]d]e", inside));
  }

  /**
   * Checks extraction of the value from a name/value statement written with
   * various combinations of whitespace, '=', '!' and ';', and that a name
   * with no value gives an empty string.
   */
  @Test(groups = { "Functional" })
  public void testGetValue()
  {
    String[] statements = { "Name=Mega", "Name =Mega", " Name = Mega ",
        "Name = Mega; ", " Name = Mega ; ", "\t!Name \t= \tMega ; ",
        "!Name \t\t Mega; " };
    for (String statement : statements)
    {
      assertEquals("Mega", MegaFile.getValue(statement));
    }

    // name only, no value
    assertEquals("", MegaFile.getValue("Name"));
  }

  /**
   * Reads interleaved MEGA data (with title, format and description) into an
   * alignment, writes the alignment out as MEGA, and checks the output is
   * functionally the same as the input.
   * 
   * @throws IOException
   *           if the data cannot be read
   */
  @Test(groups = "Functional")
  public void testRoundTrip_Interleaved() throws IOException
  {
    AlignmentI alignment = new AppletFormatAdapter().readFile(
            INTERLEAVED_WITH_DESCRIPTION, AppletFormatAdapter.PASTE, "MEGA");
    String written = new MegaFile().print(alignment);

    //@formatter:off
    String header =
            "#MEGA\n!Title Data with description;\n"
            + "!Description\n"
            + "    Line one of description\n"
            + "    Line two of description;\n"
            + "!Format\n"
            + "    DataType=DNA CodeTable=Standard\n"
            + "    NSeqs=2 NSites=12\n" // NSites includes gaps
            + "    Indel=- Identical=. Missing=?;\n\n";
    String sequenceBlocks =
            "#U455   C-- GTA [6]\n"
            + "#CPZANT ATC -G- [6]\n\n"
            + "#U455   CGA --T [12]\n"
            + "#CPZANT CA- -GC [12]\n";
    //@formatter:on
    assertEquals("Roundtrip didn't match", header + sequenceBlocks,
            written);
  }

  /**
   * Reads MEGA data containing a multi-line (nested) comment and a !Format
   * statement spread over several lines, writes the alignment out as MEGA,
   * and checks the output. The comment is not expected in the output.
   * 
   * @throws IOException
   *           if the data cannot be read
   */
  @Test(groups = "Functional")
  public void testRoundTrip_multilineFormatWithComments()
          throws IOException
  {
    //@formatter:off
    String data =
            "#MEGA\n"
            + "!Title Data with description;\n"
            + "[ this comment should be ignored\n"
            + "including [this nested comment]\n"
            + "]\n"
            + "!Format \n"
            + "DataType=DNA CodeTable=Standard\n"
            + "indel=- Missing=? MatchChar=.;\n\n"
            + "!Description\n"
            + "    Line one of description\n"
            + "    Line two of description;\n\n"
            + "#U455   CGC GTA\n"
            + "#CPZANT ATC GGG\n\n"
            + "#U455   CGA TTT\n"
            + "#CPZANT CAA TGC\n";
    //@formatter:on
    AlignmentI alignment = new AppletFormatAdapter().readFile(data,
            AppletFormatAdapter.PASTE, "MEGA");
    String written = new MegaFile().print(alignment);

    //@formatter:off
    String header =
            "#MEGA\n!Title Data with description;\n"
            + "!Description\n"
            + "    Line one of description\n"
            + "    Line two of description;\n"
            + "!Format\n"
            + "    DataType=DNA CodeTable=Standard\n"
            + "    NSeqs=2 NSites=12\n"
            + "    Indel=- Identical=. Missing=?;\n\n";
    String sequenceBlocks =
            "#U455   CGC GTA [6]\n"
            + "#CPZANT ATC GGG [6]\n\n"
            + "#U455   CGA TTT [12]\n"
            + "#CPZANT CAA TGC [12]\n";
    //@formatter:on
    assertEquals("Roundtrip didn't match", header + sequenceBlocks,
            written);
  }

  //@formatter:on
  
  /**
   * Parses interleaved data in which the second sequence uses the declared
   * identity character ('.') to mean "same as the first sequence", and in
   * which tabs rather than spaces separate the fields.
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testParse_interleavedWithIdentityAndTabs() throws IOException
  {
    //@formatter:off
    // tab separators are used throughout to check robustness
    String data =
            "#MEGA\n"
            + "!TITLE\tInterleaved sequence data;\n"
            + "!Format\tIdentical=.;\n\n"
            + "#U455\tABCDEF\n"
            + "#CPZANT\tM..P.R\n\n"
            + "#U455\t\tKLMNOP\n"
            + "#CPZANT\t..YZ..";
    //@formatter:on
    MegaFile parsed = new MegaFile(data, AppletFormatAdapter.PASTE);
    assertEquals("Title not as expected", "Interleaved sequence data",
            parsed.getAlignmentProperty(MegaFile.PROP_TITLE));

    List<SequenceI> sequences = parsed.getSeqs();
    assertEquals("Expected two sequences", 2, sequences.size());
    SequenceI first = sequences.get(0);
    SequenceI second = sequences.get(1);

    // names are expected in the same order as in the input
    assertEquals("First sequence id wrong", "U455", first.getName());
    assertEquals("Second sequence id wrong", "CPZANT", second.getName());

    // each '.' in the second sequence takes the first sequence's residue
    assertEquals("First sequence data wrong", "ABCDEFKLMNOP",
            first.getSequenceAsString());
    assertEquals("Second sequence data wrong", "MBCPERKLYZOP",
            second.getSequenceAsString());
    assertTrue("File format is not flagged as interleaved",
            parsed.isInterleaved());
  }

  /**
   * Parses non-interleaved data in which the second sequence uses the
   * declared identity character (MatchChar='.') to mean "same as the first
   * sequence".
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testParse_nonInterleavedWithIdentity() throws IOException
  {
    //@formatter:off
    String data =
            "#MEGA\n"
            + "!TITLE Noninterleaved sequence data;\n"
            + "!Format MatchChar=.;\n"
            + "#U455  \n"
            + "ABCFEDHIJ\n"
            + "MNOPQR\n\n"
            + "#CPZANT \n"
            + "KL..O..XYZ\n"
            + "CG..C\n";
    //@formatter:on
    MegaFile parsed = new MegaFile(data, AppletFormatAdapter.PASTE);
    assertEquals("Title not as expected", "Noninterleaved sequence data",
            parsed.getAlignmentProperty(MegaFile.PROP_TITLE));

    List<SequenceI> sequences = parsed.getSeqs();
    assertEquals("Expected two sequences", 2, sequences.size());
    SequenceI first = sequences.get(0);
    SequenceI second = sequences.get(1);

    // names are expected in the same order as in the input
    assertEquals("First sequence id wrong", "U455", first.getName());
    assertEquals("Second sequence id wrong", "CPZANT", second.getName());

    // each '.' in the second sequence takes the first sequence's residue
    assertEquals("First sequence data wrong", "ABCFEDHIJMNOPQR",
            first.getSequenceAsString());
    assertEquals("Second sequence data wrong", "KLCFODHXYZCGPQC",
            second.getSequenceAsString());
    assertFalse("File format is not flagged as noninterleaved",
            parsed.isInterleaved());
  }

  //@formatter:on
  
  /**
   * Parses interleaved data whose lines end with position-count comments
   * such as [6]; the comments must not appear in the sequence data.
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testParse_interleavedWithPositionNumber() throws IOException
  {
    //@formatter:off
    String data =
            "#MEGA\n"
            + "TITLE: Interleaved sequence data\n\n"
            + "#U455   ABCDEF [6]\n"
            + "#CPZANT  MNOPQR [6]\n\n"
            + "#U455   KLMNOP [12]\n"
            + "#CPZANT WXYZGC [12]\n";
    //@formatter:on
    MegaFile parsed = new MegaFile(data, AppletFormatAdapter.PASTE);
    assertEquals("Title not as expected", "Interleaved sequence data",
            parsed.getAlignmentProperty(MegaFile.PROP_TITLE));

    List<SequenceI> sequences = parsed.getSeqs();
    assertEquals("Expected two sequences", 2, sequences.size());
    SequenceI first = sequences.get(0);
    SequenceI second = sequences.get(1);

    // names are expected in the same order as in the input
    assertEquals("First sequence id wrong", "U455", first.getName());
    assertEquals("Second sequence id wrong", "CPZANT", second.getName());

    // sequence data excludes the trailing [n] comments
    assertEquals("First sequence data wrong", "ABCDEFKLMNOP",
            first.getSequenceAsString());
    assertEquals("Second sequence data wrong", "MNOPQRWXYZGC",
            second.getSequenceAsString());
    assertTrue("File format is not flagged as interleaved",
            parsed.isInterleaved());
  }

  //@formatter:on
  
  /**
   * Test parse of data with !Gene and !Domain statements. Verifies the
   * sequence data, the sequence features created for each gene and domain,
   * and the "MEGA Gene" and "MEGA Domain" alignment annotations.
   * <p>
   * Sequence feature positions are 1-based and inclusive; the ranges passed
   * to verifyAnnotation are 0-based with an exclusive end, so the feature
   * 7-12 corresponds to annotation range 6-12. The annotation ranges checked
   * below are contiguous so that every one of the 42 columns is verified.
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testParse_geneDomains() throws IOException
  {
    //@formatter:off
    String data = "#MEGA\n"+ 
    "TITLE: Interleaved sequence data\n\n" + 
    "#U455   CCCCCC\n" + 
    "#CPZANT  TTTTTT\n\n" +
    "!Domain=Exon1 Gene=Adh Property=Coding CodonStart=1;\n" +
    "#U455   GGGGGG\n" + 
    "#CPZANT AAAAAA\n\n" +
    "!domain=Intron1 Property=Intron Gene=Adh;\n" +
    "#U455   tttttt\n" + 
    "#CPZANT cccccc\n\n" +
    "!Domain=Exon2 Gene=Adh Property=Exon CodonStart=1;\n" +
    "#U455   aaaaaa\n" + 
    "#CPZANT gggggg\n\n" +
    // explicit end of Exon2, implicit end of Adh:
    "!Domain=Exon2 Property=domainend;\n" +
    "!Domain=Intron1 Gene=Opsin Property=Noncoding;\n" +
    "#U455   GGGGGG\n" + 
    "#CPZANT AAAAAA\n\n" +
    // end Opsin, start MEF2A
    "!Domain=Exon1 Gene=MEF2A Property=Coding CodonStart=1;\n" +
    "#U455   tttttt\n" + 
    "#CPZANT cccccc\n\n" +
    // end MEF2A
    "!Domain=BindingSite;\n" +
    "#U455   CCCCCC\n" + 
    "#CPZANT TTTTTT\n\n";
    //@formatter:on
    MegaFile testee = new MegaFile(data, AppletFormatAdapter.PASTE);

    Vector<SequenceI> seqs = testee.getSeqs();
    // should be 2 sequences
    assertEquals("Expected two sequences", 2, seqs.size());
    // check sequence data
    assertEquals("First sequence data wrong",
            "CCCCCCGGGGGGttttttaaaaaaGGGGGGttttttCCCCCC", seqs.get(0)
            .getSequenceAsString());
    assertEquals("Second sequence data wrong",
            "TTTTTTAAAAAAccccccggggggAAAAAAccccccTTTTTT", seqs.get(1)
            .getSequenceAsString());

    /*
     * sequences should have features (1-based, inclusive) for
     * Gene=Adh 7-24 with domains Exon1 7-12, Intron1 13-18, Exon2 19-24;
     * Gene=Opsin 25-30 with domain Intron1 25-30;
     * Gene=MEF2A 31-36 with domain Exon1 31-36;
     * and domain BindingSite 37-42 (no gene)
     */
    for (SequenceI seq : seqs)
    {
      SequenceFeature[] sfs = seq.getSequenceFeatures();
      // features are added in the order in which their end is found
      // (Domain before Gene when they end together)
      assertEquals(9, sfs.length);
      // TODO settle which way round type/description go!
      verifySequenceFeature(sfs[0], "Exon1 (Adh Coding)", "Domain", 7, 12);
      verifySequenceFeature(sfs[1], "Intron1 (Adh Noncoding)", "Domain",
              13, 18);
      verifySequenceFeature(sfs[2], "Exon2 (Adh Coding)", "Domain", 19, 24);
      verifySequenceFeature(sfs[3], "Adh", "Gene", 7, 24);
      verifySequenceFeature(sfs[4], "Intron1 (Opsin Noncoding)", "Domain",
              25, 30);
      verifySequenceFeature(sfs[5], "Opsin", "Gene", 25, 30);
      verifySequenceFeature(sfs[6], "Exon1 (MEF2A Coding)", "Domain", 31,
              36);
      verifySequenceFeature(sfs[7], "MEF2A", "Gene", 31, 36);
      verifySequenceFeature(sfs[8], "BindingSite", "Domain", 37, 42);
    }

    /*
     * verify gene and domain alignment annotations; ranges are 0-based with
     * exclusive end, and must be contiguous so that no column is skipped
     */
    assertEquals(2, testee.annotations.size());
    AlignmentAnnotation ann = testee.annotations.get(0);
    assertEquals("MEGA Gene", ann.label);
    assertEquals(42, ann.annotations.length);
    verifyAnnotation(ann, 0, 6, null);
    verifyAnnotation(ann, 6, 24, "Adh");
    verifyAnnotation(ann, 24, 30, "Opsin");
    verifyAnnotation(ann, 30, 36, "MEF2A");
    // BindingSite has no gene
    verifyAnnotation(ann, 36, 42, null);

    ann = testee.annotations.get(1);
    assertEquals("MEGA Domain", ann.label);
    assertEquals(42, ann.annotations.length);
    verifyAnnotation(ann, 0, 6, null);
    verifyAnnotation(ann, 6, 12, "Exon1 (Adh Coding)");
    verifyAnnotation(ann, 12, 18, "Intron1 (Adh Noncoding)");
    verifyAnnotation(ann, 18, 24, "Exon2 (Adh Coding)");
    verifyAnnotation(ann, 24, 30, "Intron1 (Opsin Noncoding)");
    verifyAnnotation(ann, 30, 36, "Exon1 (MEF2A Coding)");
    verifyAnnotation(ann, 36, 42, "BindingSite");

  }

  /**
   * Helper method to verify a range of annotation positions all have the given
   * description. If the expected description is null, each position in the
   * range must hold a null annotation; otherwise each position must hold a
   * non-null annotation with exactly that description. Failure messages name
   * the offending position.
   * 
   * @param ann
   *          the alignment annotation whose annotations array is checked
   * @param from
   *          start index to check (0-based, inclusive)
   * @param to
   *          end index to check (exclusive)
   * @param description
   *          value to assert, or null to assert no annotation is present
   */
  protected void verifyAnnotation(AlignmentAnnotation ann, int from,
          int to, String description)
  {
    for (int pos = from; pos < to; pos++)
    {
      String where = "Annotation at position " + pos;
      if (description == null)
      {
        assertNull(where + " should be null", ann.annotations[pos]);
      }
      else
      {
        // fail with a clear message rather than a NullPointerException
        assertTrue(where + " is missing, expected '" + description + "'",
                ann.annotations[pos] != null);
        assertEquals(where + " has wrong description", description,
                ann.annotations[pos].description);
      }
    }
  }

  /**
   * Helper method to assert properties of a SequenceFeature.
   * <p>
   * Note the apparent cross-over: the <code>description</code> argument is
   * compared with the feature's <code>type</code> field, and the
   * <code>type</code> argument with the feature's <code>description</code>
   * field. Callers in this class pass the detailed text (e.g. "Exon1 (Adh
   * Coding)") as <code>description</code> and the category ("Domain" or
   * "Gene") as <code>type</code>; a TODO at one call site records that which
   * way round these should go is still to be settled.
   * 
   * @param sf
   *          the feature to check
   * @param description
   *          expected value of the feature's type field
   * @param type
   *          expected value of the feature's description field
   * @param begin
   *          expected value of the feature's begin field
   * @param end
   *          expected value of the feature's end field
   */
  protected void verifySequenceFeature(SequenceFeature sf,
          String description, String type, int begin, int end)
  {
    // deliberately crossed over - see method comment
    assertEquals(description, sf.type);
    assertEquals(type, sf.description);
    assertEquals(begin, sf.begin);
    assertEquals(end, sf.end);
  }

  //@formatter:on
  
  /**
   * Parses interleaved data that includes !Label statements. An underscore
   * means no label at that position; any other character is expected to
   * appear in a single "MEGA Label" alignment annotation. A block with no
   * !Label statement gives no labels for its positions.
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testParse_withLabels() throws IOException
  {
    //@formatter:off
    String data =
            "#MEGA\n"
            + "TITLE: Interleaved sequence data\n\n"
            + "#U455   ABC DEF\n"
            + "#CPZANT MNO PQR\n"
            + "!Label  +-_ 23_\n\n"
            // a row with no labels = null annotation
            + "#U455   abc def\n"
            + "#CPZANT mno pqr\n\n"
            + "#U455   KLM NOP\n"
            + "#CPZANT WXY ZGC\n"
            + "!label  __3 +X_\n";
    //@formatter:on
    MegaFile parsed = new MegaFile(data, AppletFormatAdapter.PASTE);

    List<SequenceI> sequences = parsed.getSeqs();
    assertEquals("Expected two sequences", 2, sequences.size());
    SequenceI first = sequences.get(0);
    assertEquals("First sequence data wrong", "ABCDEFabcdefKLMNOP",
            first.getSequenceAsString());
    SequenceI second = sequences.get(1);
    assertEquals("Second sequence data wrong", "MNOPQRmnopqrWXYZGC",
            second.getSequenceAsString());

    // exactly one alignment annotation, not tied to any sequence
    assertEquals(1, parsed.annotations.size());
    AlignmentAnnotation labels = parsed.annotations.get(0);
    assertNull(labels.sequenceRef);
    assertEquals("MEGA Label", labels.label);
    assertEquals(18, labels.annotations.length);
    assertEquals("+, -, , 2, 3, , , , , , , , , , 3, +, X, , ",
            labels.toString());
  }

  //@formatter:on
  
  /**
   * Test case where a domain is implicitly terminated by the start of a new
   * gene: Exon1 of gene1 should end where gene2 begins.
   * 
   * @throws IOException
   *           if MegaFile fails to parse the data
   */
  @Test(groups = { "Functional" })
  public void testParse_changeOfGeneEndsDomain() throws IOException
  {
    //@formatter:off
    String data =
            "#MEGA\n"
            + "!TITLE Interleaved sequence data;\n"
            + "!Format Identical=.;\n\n"
            + "!Gene=gene1 Domain=Exon1 Property=Coding;\n"
            + "#U455 ABCDEF\n"
            + "#CPZANT M..P.R\n\n"
            + "!Gene=gene2;\n"
            + "#U455 KLMNOP\n"
            + "#CPZANT ..YZ..";
    //@formatter:on
    MegaFile parsed = new MegaFile(data, AppletFormatAdapter.PASTE);

    List<SequenceI> sequences = parsed.getSeqs();
    assertEquals("Expected two sequences", 2, sequences.size());
    assertEquals("First sequence data wrong", "ABCDEFKLMNOP",
            sequences.get(0).getSequenceAsString());
    assertEquals("Second sequence data wrong", "MBCPERKLYZOP",
            sequences.get(1).getSequenceAsString());
    assertTrue("File format is not flagged as interleaved",
            parsed.isInterleaved());

    // both sequences should carry the same three features
    for (int i = 0; i < sequences.size(); i++)
    {
      SequenceFeature[] features = sequences.get(i).getSequenceFeatures();
      assertEquals(3, features.length);
      verifySequenceFeature(features[0], "Exon1 (gene1 Coding)", "Domain",
              1, 6);
      verifySequenceFeature(features[1], "gene1", "Gene", 1, 6);
      verifySequenceFeature(features[2], "gene2", "Gene", 7, 12);
    }
  }

  //@formatter:on
  
  /**
   * Test case where the declared gap (Indel) character is '%'; in the
   * alignment that is read, it is expected to have been converted to '-'.
   * 
   * @throws IOException
   *           if the data cannot be read
   */
  @Test(groups = { "Functional" })
  public void testParse_weirdGapCharacter() throws IOException
  {
    //@formatter:off
    String data =
            "#MEGA\n"
            + "!TITLE Interleaved sequence data;\n"
            + "!Format Identical=. Indel=%;\n\n"
            + "#U455 %BC%EF\n"
            + "#CPZANT M..P.R\n\n"
            + "#U455 KLMNOP\n"
            + "#CPZANT .%%Z..";
    //@formatter:on
    AlignmentI alignment = new AppletFormatAdapter().readFile(data,
            AppletFormatAdapter.PASTE, "MEGA");

    List<SequenceI> sequences = alignment.getSequences();
    SequenceI first = sequences.get(0);
    assertEquals("First sequence data wrong", "-BC-EFKLMNOP",
            first.getSequenceAsString());
    SequenceI second = sequences.get(1);
    assertEquals("Second sequence data wrong", "MBCPERK--ZOP",
            second.getSequenceAsString());
    assertEquals('-', alignment.getGapCharacter());
  }

  /**
   * Reads MEGA data that includes !Label statements into an alignment, checks
   * the resulting "MEGA Label" alignment annotation, then writes the
   * alignment out as MEGA and checks the labels are written back (grouped in
   * triplets like the sequence data).
   * 
   * @throws IOException
   *           if the data cannot be read
   */
  @Test(groups = "Functional")
  public void testRoundTrip_withLabels() throws IOException
  {
    //@formatter:off
    String data =
            "#MEGA\n"
            + "#U455   C-- GTA\n"
            + "#CPZANT ATC -G-\n"
            + "!Label F__E_H\n\n"
            + "#U455   CGA --T\n"
            + "#CPZANT CA- -GC\n"
            + "!Label FFH__E\n";
    //@formatter:on
    AlignmentI alignment = new AppletFormatAdapter().readFile(data,
            AppletFormatAdapter.PASTE, "MEGA");

    // labels are held as the first alignment annotation
    AlignmentAnnotation labels = alignment.getAlignmentAnnotation()[0];
    assertEquals("MEGA Label", labels.label);
    assertEquals("F, , , E, , H, F, F, H, , , E, ", labels.toString());

    String written = new MegaFile().print(alignment);
    //@formatter:off
    String header =
            "#MEGA\n"
            + "!Format\n"
            + "    DataType=Nucleotide CodeTable=Standard\n"
            + "    NSeqs=2 NSites=12\n"
            + "    Indel=-;\n\n";
    String firstBlock =
            "#U455   C-- GTA [6]\n"
            + "#CPZANT ATC -G- [6]\n"
            + "!Label F__ E_H;\n\n";
    String secondBlock =
            "#U455   CGA --T [12]\n"
            + "#CPZANT CA- -GC [12]\n"
            + "!Label FFH __E;\n";
    //@formatter:on
    assertEquals("Roundtrip didn't match",
            header + firstBlock + secondBlock, written);
  }
}