#include "tests.h"

#include "json-schema-to-grammar.h"

#include <regex>
#include <string>

// Normalizes indentation so GBNF texts can be compared regardless of how they
// are indented.
//
// Every whitespace run located at the very start of `s`, or directly after a
// '\n', is removed; the captured prefix (`$1`, i.e. the newline itself when
// there is one) is put back. Because `\s` also matches '\n', newlines that
// directly follow another newline are swallowed as part of the same run.
//
// @param s text to normalize
// @return a copy of `s` with those whitespace runs removed
static std::string trim_leading_space(const std::string & s) {
    // Compiled once and reused by every call.
    static const std::regex leading_ws_re(R"((^|\n)\s+)");
    return std::regex_replace(s, leading_ws_re, "$1");
}

// Reports, through `t.assert_equal` under the label "gbnf are equal", whether
// two GBNF texts match once both went through trim_leading_space().
//
// @param t        test context that receives the comparison
// @param expected reference grammar text
// @param actual   grammar text produced by the code under test
static void assert_gbnf_equal(testing & t, const std::string & expected, const std::string & actual) {
    t.assert_equal("gbnf are equal", trim_leading_space(expected), trim_leading_space(actual));
}

// Registers the GBNF generation test cases on `t`. Each case builds a PEG
// parser with build_peg_parser(), emits its grammar through build_grammar(),
// and compares the resulting text with the expected GBNF via
// assert_gbnf_equal().
void test_gbnf_generation(testing &t) {
    t.test("literal grammar generation", [](testing &t) {
        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("hello");
        });

        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
            parser.build_grammar(builder);
        });

        assert_gbnf_equal(t, R"""( root ::= "hello" space ::= | " " | "\n"{1,2} [ \t]{0,20} )""", gbnf);
    });

    t.test("char class grammar", [](testing &t) {
        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
            return p.chars("[a-z]", 1, 1);
        });

        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
            parser.build_grammar(builder);
        });

        assert_gbnf_equal(t, R"""( root ::= [a-z] space ::= | " " | "\n"{1,2} [ \t]{0,20} )""", gbnf);
    });

    t.test("sequence grammar", [](testing &t) {
        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("hello") + p.literal(" ") + p.literal("world");
        });

        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
            parser.build_grammar(builder);
        });

        assert_gbnf_equal(t, R"""( root ::= "hello" " " "world" space ::= | " " | "\n"{1,2} [ \t]{0,20} )""", gbnf);
    });

    t.test("choice grammar", [](testing &t) {
        auto parser = build_peg_parser([](common_peg_parser_builder & p) {
            return p.literal("cat") | p.literal("dog");
        });

        auto gbnf = build_grammar([&](const common_grammar_builder & builder) {
            parser.build_grammar(builder);
        });

        assert_gbnf_equal(t, R"""( root ::= "cat" | "dog" space ::= | " " | "\n"{1,2} [ \t]{0,20} )""", gbnf);
    });

    t.test("one_or_more grammar",
[](testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.one_or_more(p.literal("a")); }); auto gbnf = build_grammar([&](const common_grammar_builder & builder) { parser.build_grammar(builder); }); assert_gbnf_equal(t, R"""( root ::= "a"+ space ::= | " " | "\n"{1,2} [ \t]{0,20} )""", gbnf); }); t.test("zero_or_more grammar", [](testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.zero_or_more(p.literal("a")); }); auto gbnf = build_grammar([&](const common_grammar_builder & builder) { parser.build_grammar(builder); }); assert_gbnf_equal(t, R"""( root ::= "a"* space ::= | " " | "\n"{1,2} [ \t]{0,20} )""", gbnf); }); t.test("optional grammar", [](testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.literal("hello") + p.optional(p.literal(" world")); }); auto gbnf = build_grammar([&](const common_grammar_builder & builder) { parser.build_grammar(builder); }); assert_gbnf_equal(t, R"""( root ::= "hello" " world"? 
space ::= | " " | "\n"{1,2} [ \t]{0,20} )""", gbnf); }); t.test("until grammar", [](testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.until(""); }); auto gbnf = build_grammar([&](const common_grammar_builder & builder) { parser.build_grammar(builder); }); assert_gbnf_equal(t, R"""( root ::= until-0 space ::= | " " | "\n"{1,2} [ \t]{0,20} until-0 ::= | [<] until-0-01 | [^<] until-0 until-0-01 ::= | [<] until-0-01 | [/] until-0-02 | [^/<] until-0 until-0-02 ::= | [<] until-0-01 | [t] until-0-03 | [^] until-0 )""", gbnf); }); t.test("until grammar overlapping delimiter", [](testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.until("\n\n"); }); auto gbnf = build_grammar([&](const common_grammar_builder & builder) { parser.build_grammar(builder); }); assert_gbnf_equal(t, R"""( root ::= until-0 space ::= | " " | "\n"{1,2} [ \t]{0,20} until-0 ::= | [\n] until-0-01 | [^\n] until-0 until-0-01 ::= | [\n] until-0-01 | [<] until-0-02 | [^\n<] until-0 until-0-02 ::= | [\n] until-0-01 | [/] until-0-03 | [^\n/] until-0 until-0-03 ::= | [\n] until-0-01 | [p] until-0-04 | [^\np] until-0 until-0-04 ::= | [\n] until-0-01 | [a] until-0-05 | [^\na] until-0 until-0-05 ::= | [\n] until-0-01 | [r] until-0-06 | [^\nr] until-0 until-0-06 ::= | [\n] until-0-01 | [a] until-0-07 | [^\na] until-0 until-0-07 ::= | [\n] until-0-01 | [m] until-0-08 | [^\nm] until-0 until-0-08 ::= | [\n] until-0-01 | [e] until-0-09 | [^\ne] until-0 until-0-09 ::= | [\n] until-0-01 | [t] until-0-10 | [^\nt] until-0 until-0-10 ::= | [\n] until-0-01 | [e] until-0-11 | [^\ne] until-0 until-0-11 ::= | [\n] until-0-01 | [r] until-0-12 | [^\nr] until-0 until-0-12 ::= | [\n] until-0-01 | [>] until-0-13 | [^\n>] until-0 until-0-13 ::= | [^\n] until-0 )""", gbnf); }); // DeepSeek-V3.2 tag prefix. The DSML token (｜DSML｜) embeds U+FF5C, // so the delimiter mixes ASCII and multi-byte codepoints. 
t.test("until grammar unicode delimiter", [](testing &t) { auto parser = build_peg_parser([](common_peg_parser_builder & p) { return p.until("<｜DSML｜"); }); auto gbnf = build_grammar([&](const common_grammar_builder & builder) { parser.build_grammar(builder); }); assert_gbnf_equal(t, R"""( root ::= until-0 space ::= | " " | "\n"{1,2} [ \t]{0,20} until-0 ::= | [<] until-0-01 | [^<] until-0 until-0-01 ::= | [<] until-0-01 | [\uFF5C] until-0-02 | [^<\uFF5C] until-0 until-0-02 ::= | [<] until-0-01 | [D] until-0-03 | [^