From fb401045cc284ec185bf3ce6eca878a2c288f33b Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 24 Jun 2026 18:12:16 +0200 Subject: [PATCH] common: remove unused json-partial (#24968) --- common/CMakeLists.txt | 2 - common/json-partial.cpp | 324 ------------------------------------ common/json-partial.h | 39 ----- tests/CMakeLists.txt | 1 - tests/test-json-partial.cpp | 287 -------------------------------- 5 files changed, 653 deletions(-) delete mode 100644 common/json-partial.cpp delete mode 100644 common/json-partial.h delete mode 100644 tests/test-json-partial.cpp diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index c42320c46b..fc16b21cf1 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -80,8 +80,6 @@ add_library(${TARGET} http.h imatrix-loader.cpp imatrix-loader.h - json-partial.cpp - json-partial.h json-schema-to-grammar.cpp llguidance.cpp log.cpp diff --git a/common/json-partial.cpp b/common/json-partial.cpp deleted file mode 100644 index aaf11310ab..0000000000 --- a/common/json-partial.cpp +++ /dev/null @@ -1,324 +0,0 @@ -#include "json-partial.h" - -#include "log.h" - -#include - -#include -#include - -using json = nlohmann::ordered_json; - -enum common_json_stack_element_type { - COMMON_JSON_STACK_ELEMENT_OBJECT, - COMMON_JSON_STACK_ELEMENT_KEY, - COMMON_JSON_STACK_ELEMENT_ARRAY, -}; - -struct common_json_stack_element { - common_json_stack_element_type type; - std::string key; -}; - -bool common_json_parse( - const std::string & input, - const std::string & healing_marker, - common_json & out) -{ - std::string::const_iterator it = input.begin(); - const auto end = input.end(); - return common_json_parse(it, end, healing_marker, out); -} - -bool common_json_parse( - std::string::const_iterator & it, - const std::string::const_iterator & end, - const std::string & healing_marker, - common_json & out) -{ - // // https://json.nlohmann.me/features/parsing/sax_interface/ - struct json_error_locator : public nlohmann::json_sax { - std::size_t position; - bool found_error; - std::string last_token; - std::string exception_message; - std::vector stack; - - json_error_locator() : position(0), found_error(false) {} - - bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT - this->position = position - 1; - this->found_error = true; - this->last_token = last_token; - this->exception_message = ex.what(); - return false; - } - void close_value() { - if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) { - stack.pop_back(); - } - } - bool null() override { // NOLINT - close_value(); - return true; - } - bool boolean(bool) override { // NOLINT - close_value(); - return true; - } - bool number_integer(number_integer_t) override { // NOLINT - close_value(); - return true; - } - bool number_unsigned(number_unsigned_t) override { // NOLINT - close_value(); - return true; - } - bool number_float(number_float_t, const string_t &) override { // NOLINT - close_value(); - return true; - } - bool string(string_t &) override { // NOLINT - close_value(); - return true; - } - bool binary(binary_t &) override { // NOLINT - close_value(); - return true; - } - bool start_object(std::size_t) override { // NOLINT - stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""}); - return true; - } - bool end_object() override { - GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT); - stack.pop_back(); - close_value(); - return true; - } - bool key(string_t & key) override { // NOLINT - stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key}); - return true; - } - bool start_array(std::size_t) override { // NOLINT - stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""}); - return true; - } - bool end_array() override { - GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY); - stack.pop_back(); - close_value(); - return true; - } - }; - json_error_locator err_loc; - auto start = it; - json::sax_parse(it, end, &err_loc); - - if (err_loc.found_error) { - it = start; - auto temptative_end = it + err_loc.position; - // LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str()); - - auto input = std::string(it, temptative_end); - try { - out.json = json::parse(input); - // out.json = json::parse(it, temptative_end); - it = temptative_end; - return true; - } catch (const std::exception & ex) { - // No, needs healing. - LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str()); - } - auto can_parse = [](const std::string & str) { - try { - auto _ = json::parse(str); // NOLINT - return true; - } catch (const std::exception &) { - return false; - } - }; - if (!healing_marker.empty() && !err_loc.stack.empty()) { - std::string str(it, temptative_end); - auto last_non_sp_pos = str.find_last_not_of(" \n\r\t"); - if (last_non_sp_pos == std::string::npos) { - throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location"); - } - auto last_non_sp_char = str[last_non_sp_pos]; - // Used to detect stops on a number, which may not be complete. - auto was_maybe_number = [&]() { - if (!str.empty() && std::isspace(str.back())) { - return false; - } - return std::isdigit(last_non_sp_char) || - last_non_sp_char == '.' || - last_non_sp_char == 'e' || - last_non_sp_char == 'E' || - last_non_sp_char == '-'; - }; - - std::string closing; - for (size_t i = err_loc.stack.size(); i > 0; i--) { - auto & el = err_loc.stack[i - 1]; - if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) { - closing += "}"; - } else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) { - closing += "]"; - } else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) { - throw std::runtime_error("Unexpected stack element type"); - } - } - - // Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX - static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)"); - - auto is_high_surrogate = [&](const std::string & s) { - // Check if a partial of a high surrogate (U+D800-U+DBFF) - return s.length() >= 4 && - s[0] == '\\' && s[1] == 'u' && - std::tolower(s[2]) == 'd' && - (s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b'); - }; - - // Initialize the unicode marker to a low surrogate to handle the edge case - // where a high surrogate (U+D800-U+DBFF) is immediately followed by a - // backslash (\) - std::string unicode_marker_padding = "udc00"; - std::smatch last_unicode_seq; - - if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) { - std::smatch second_last_seq; - std::string prelude = str.substr(0, last_unicode_seq.position()); - - // Pad the escape sequence with 0s until it forms a complete sequence of 6 characters - unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0'); - - if (is_high_surrogate(last_unicode_seq.str())) { - // If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF) - unicode_marker_padding += "\\udc00"; - } else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) { - if (is_high_surrogate(second_last_seq.str())) { - // If this follows a high surrogate, pad it to be a low surrogate - if (last_unicode_seq.length() == 2) { - unicode_marker_padding = "dc00"; - } else if (last_unicode_seq.length() == 3) { - unicode_marker_padding = "c00"; - } else { - // The original unicode_marker_padding is already padded with 0s - } - } - } - } - - const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$"; - - if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) { - // We're inside an object value - if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) { - // Was about to create an object value - str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; - } else if (can_parse(str + ": 1" + closing)) { - str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing; - } else if (last_non_sp_char == '{' && can_parse(str + closing)) { - // Was about to create an object - str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing; - } else if (can_parse(str + "\"" + closing)) { - // Was inside an object value string - str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing; - } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) { - // Was inside an object value string after an escape - str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing; - } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) { - // Was inside an object value string after a partial unicode escape - str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing; - } else { - // find last : - auto last_pos = str.find_last_of(':'); - if (last_pos == std::string::npos) { - throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location"); - } - // Cutting back to opening : for object value - str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; - } - } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) { - if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) { - // Was about to create an array value - str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; - } else if (can_parse(str + "\"" + closing)) { - // Was inside an array value string - str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing; - } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) { - // Was inside an array value string after an escape - str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing; - } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) { - // Was inside an array value string after a partial unicode escape - str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing; - } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) { - // Had just finished a value - str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing; - } else { - auto last_pos = str.find_last_of("[,"); - if (last_pos == std::string::npos) { - throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location"); - } - // Cutting back to last [ or , for array value - str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; - } - } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) { - if ((last_non_sp_char == '{' && can_parse(str + closing)) || - (last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) { - // Was about to create an object key+value - str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing; - } else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) { - // Was about to create an object key+value - str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing; - } else if (can_parse(str + "\": 1" + closing)) { - // Was inside an object key string - str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing; - } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) { - // Was inside an object key string after an escape - str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing; - } else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) { - // Was inside an object key string after a partial unicode escape - str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing; - } else { - auto last_pos = str.find_last_of(':'); - if (last_pos == std::string::npos) { - throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location"); - } - // fprintf(stderr, "Cutting back to last : for object key+value\n"); - str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; - } - } else { - throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location"); - } - // fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str()); - out.json = json::parse(str); - it = temptative_end; - return true; - } - // handle unclosed top-level primitive - if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) { - std::string str(it, temptative_end); - const auto & magic_seed = out.healing_marker.marker = healing_marker; - if (can_parse(str + "\"")) { - // Was inside an string - str += (out.healing_marker.json_dump_marker = magic_seed) + "\""; - } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) { - // Was inside an string after an escape - str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\""; - } else { - // TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...) - // fprintf(stderr, "Closing: TODO\n"); - return false; - } - out.json = json::parse(str); - it = temptative_end; - return true; - } - return false; - } - out.json = json::parse(it, end); - it = end; - return true; -} diff --git a/common/json-partial.h b/common/json-partial.h deleted file mode 100644 index be51aabfbf..0000000000 --- a/common/json-partial.h +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -// TODO: use json_fwd.hpp when possible -#include - -// Healing marker (empty if the JSON was fully parsed / wasn't healed). -struct common_healing_marker { - // Raw marker. - std::string marker; - - // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format). - std::string json_dump_marker; -}; - -// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string) -struct common_json { - nlohmann::ordered_json json; - - common_healing_marker healing_marker; -}; - -// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty. -// -// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON. -// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker. -// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format). -// -// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again). -bool common_json_parse( - const std::string & input, - const std::string & healing_marker, - common_json & out); - -// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds. -bool common_json_parse( - std::string::const_iterator & it, - const std::string::const_iterator & end, - const std::string & healing_marker, - common_json & out); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e284a58d1c..0dd1d7b162 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -199,7 +199,6 @@ llama_build_and_test(test-jinja.cpp) llama_test(test-jinja NAME test-jinja-py ARGS -py LABEL python) llama_build_and_test(test-chat-auto-parser.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) llama_build_and_test(test-chat-template.cpp) -llama_build_and_test(test-json-partial.cpp) llama_build_and_test(test-log.cpp) llama_build_and_test( test-peg-parser.cpp diff --git a/tests/test-json-partial.cpp b/tests/test-json-partial.cpp deleted file mode 100644 index 39da9276ef..0000000000 --- a/tests/test-json-partial.cpp +++ /dev/null @@ -1,287 +0,0 @@ -#include "common.h" -#include "json-partial.h" -#include -#include -#include - -template static void assert_equals(const T & expected, const T & actual) { - if (expected != actual) { - std::cerr << "Expected: " << expected << std::endl; - std::cerr << "Actual: " << actual << std::endl; - std::cerr << std::flush; - throw std::runtime_error("Test failed"); - } -} - -static void test_json_healing() { - auto parse = [](const std::string & str) { - std::cerr << "# Parsing: " << str << '\n'; - std::string::const_iterator it = str.begin(); - const auto end = str.end(); - common_json out; - std::string healing_marker = "$llama.cpp.json$"; - if (common_json_parse(it, end, healing_marker, out)) { - auto dump = out.json.dump(); - std::cerr << "Parsed: " << dump << '\n'; - std::cerr << "Magic: " << out.healing_marker.json_dump_marker << '\n'; - std::string result; - if (!out.healing_marker.json_dump_marker.empty()) { - auto i = dump.find(out.healing_marker.json_dump_marker); - if (i == std::string::npos) { - throw std::runtime_error("Failed to find magic in dump " + dump + " (magic: " + out.healing_marker.json_dump_marker + ")"); - } - result = dump.substr(0, i); - } else { - result = dump; - } - std::cerr << "Result: " << result << '\n'; - if (string_starts_with(str, result)) { - std::cerr << "Failure!\n"; - } - // return dump; - } else { - throw std::runtime_error("Failed to parse: " + str); - } - - }; - auto parse_all = [&](const std::string & str) { - for (size_t i = 1; i < str.size(); i++) { - parse(str.substr(0, i)); - } - }; - parse_all("{\"a\": \"b\"}"); - parse_all("{\"hey\": 1, \"ho\\\"ha\": [1]}"); - - parse_all("[{\"a\": \"b\"}]"); - - auto test = [&](const std::vector & inputs, const std::string & expected, const std::string & expected_marker) { - for (const auto & input : inputs) { - common_json out; - assert_equals(true, common_json_parse(input, "$foo", out)); - assert_equals(expected, out.json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true)); - assert_equals(expected_marker, out.healing_marker.json_dump_marker); - } - }; - // No healing needed: - test( - { - R"([{"a":"b"}, "y"])", - }, - R"([{"a":"b"},"y"])", - "" - ); - // Partial literals can't be healed: - test( - { - R"([1)", - R"([tru)", - R"([n)", - R"([nul)", - R"([23.2)", - }, - R"(["$foo"])", - R"("$foo)" - ); - test( - { - R"({"a": 1)", - R"({"a": tru)", - R"({"a": n)", - R"({"a": nul)", - R"({"a": 23.2)", - }, - R"({"a":"$foo"})", - R"("$foo)" - ); - test( - { - R"({)", - }, - R"({"$foo":1})", - R"("$foo)" - ); - test( - { - R"([)", - }, - R"(["$foo"])", - R"("$foo)" - ); - // Healing right after a full literal - test( - { - R"(1 )", - }, - R"(1)", - "" - ); - test( - { - R"(true)", - R"(true )", - }, - R"(true)", - "" - ); - test( - { - R"(null)", - R"(null )", - }, - R"(null)", - "" - ); - test( - { - R"([1 )", - }, - R"([1,"$foo"])", - R"(,"$foo)" - ); - test( - { - R"([{})", - R"([{} )", - }, - R"([{},"$foo"])", - R"(,"$foo)" - ); - test( - { - R"([true)", - }, - // TODO: detect the true/false/null literal was complete - R"(["$foo"])", - R"("$foo)" - ); - test( - { - R"([true )", - }, - R"([true,"$foo"])", - R"(,"$foo)" - ); - test( - { - R"([true,)", - }, - R"([true,"$foo"])", - R"("$foo)" - ); - // Test nesting - test( - { - R"([{"a": [{"b": [{)", - }, - R"([{"a":[{"b":[{"$foo":1}]}]}])", - R"("$foo)" - ); - test( - { - R"([{"a": [{"b": [)", - }, - R"([{"a":[{"b":["$foo"]}]}])", - R"("$foo)" - ); - - test( - { - R"([{"a": "b"})", - R"([{"a": "b"} )", - }, - R"([{"a":"b"},"$foo"])", - R"(,"$foo)" - ); - test( - { - R"([{"a": "b"},)", - R"([{"a": "b"}, )", - }, - R"([{"a":"b"},"$foo"])", - R"("$foo)" - ); - test( - { - R"({ "code)", - }, - R"({"code$foo":1})", - R"($foo)" - ); - test( - { - R"({ "code\)", - }, - R"({"code\\$foo":1})", - R"(\$foo)" - ); - test( - { - R"({ "code")", - }, - R"({"code":"$foo"})", - R"(:"$foo)" - ); - test( - { - R"({ "key")", - }, - R"({"key":"$foo"})", - R"(:"$foo)" - ); - // Test unicode escape sequences - test( - { - R"({"a":"\u)", - }, - R"({"a":"\u0000$foo"})", - R"(0000$foo)" - ); - test( - { - R"({"a":"\u00)", - }, - R"({"a":"\u0000$foo"})", - R"(00$foo)" - ); - test( - { - R"({"a":"\ud300)", - }, - R"({"a":"\ud300$foo"})", - R"($foo)" - ); - test( - { - R"({"a":"\ud800)", - }, - R"({"a":"\ud800\udc00$foo"})", - R"(\udc00$foo)" - ); - test( - { - R"({"a":"\ud800\)", - }, - R"({"a":"\ud800\udc00$foo"})", - R"(udc00$foo)" - ); - test( - { - R"({"a":"\ud800\u)", - }, - R"({"a":"\ud800\udc00$foo"})", - R"(dc00$foo)" - ); - test( - { - R"({"a":"\ud800\udc00)", - }, - R"({"a":"\ud800\udc00$foo"})", - R"($foo)" - ); -} - -int main() { - test_json_healing(); - std::cerr << "All tests passed.\n"; - return 0; -}