common : gracefully handle incomplete output (#20191)

* common : handle incomplete UTF-8 at end of input in PEG parser

* cont : if reached end prematurely, emit needs_more_input to propagate partial output

* cont: refactor peg parse context to add lenient flag

* cont : remove partial flag, keep lenient flag
This commit is contained in:
Aldehir Rojas
2026-03-08 11:17:02 -05:00
committed by GitHub
parent 9b24886f78
commit 451ef08432
11 changed files with 139 additions and 125 deletions

View File

@@ -167,8 +167,8 @@ void tag_based_peg_mapper::from_ast(const common_peg_ast_arena & arena, const co
});
}
tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & input, bool is_partial) const {
common_peg_parse_context ctx(input, is_partial);
tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags) const {
common_peg_parse_context ctx(input, flags | extra_flags);
auto parse_result = arena.parse(ctx);
tag_based_peg_mapper mapper;
@@ -179,11 +179,10 @@ tagged_parse_result tagged_peg_parser::parse_and_extract(const std::string & inp
tagged_parse_result tagged_peg_parser::parse_anywhere_and_extract(const std::string & input) const {
if (input.empty()) {
return parse_and_extract(input, false);
return parse_and_extract(input);
}
for (size_t i = 0; i < input.size(); i++) {
common_peg_parse_context ctx(input, false);
ctx.debug = debug;
common_peg_parse_context ctx(input, flags);
auto parse_result = arena.parse(ctx, i);
if (parse_result.success() || i == input.size() - 1) {
tag_based_peg_mapper mapper;

View File

@@ -155,19 +155,19 @@ struct tagged_parse_result {
struct tagged_peg_parser {
common_peg_arena arena;
bool debug = false;
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE;
tagged_peg_parser & withDebug() {
debug = true;
flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
return *this;
}
tagged_peg_parser & withoutDebug() {
debug = false;
flags = flags & ~COMMON_PEG_PARSE_FLAG_DEBUG;
return *this;
}
tagged_parse_result parse_and_extract(const std::string & input, bool is_partial = false) const;
tagged_parse_result parse_and_extract(const std::string & input, common_peg_parse_flags extra_flags = COMMON_PEG_PARSE_FLAG_NONE) const;
tagged_parse_result parse_anywhere_and_extract(const std::string & input) const;
};

View File

@@ -1527,8 +1527,12 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
LOG_DBG("Parsing PEG input with format %s: %s\n", common_chat_format_name(params.format), input.c_str());
common_peg_parse_context ctx(input, is_partial);
ctx.debug = params.debug;
common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_LENIENT;
if (params.debug) {
flags |= COMMON_PEG_PARSE_FLAG_DEBUG;
}
common_peg_parse_context ctx(input, flags);
auto result = parser.parse(ctx);
if (result.fail()) {
@@ -1541,7 +1545,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "\nAST for partial parse (fail):\n%s\n", ctx.ast.dump().c_str());
fflush(stderr);
}
@@ -1557,7 +1561,7 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars
auto mapper = common_chat_peg_mapper(msg);
mapper.from_ast(ctx.ast, result);
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "\nAST for %s parse:\n%s\n", is_partial ? "partial" : "full", ctx.ast.dump().c_str());
fflush(stderr);
}

View File

@@ -349,7 +349,7 @@ struct parser_executor {
auto pos = start_pos;
for (auto i = 0u; i < p.literal.size(); ++i) {
if (pos >= ctx.input.size()) {
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@@ -364,7 +364,7 @@ struct parser_executor {
}
common_peg_parse_result operator()(const common_peg_sequence_parser & p) {
if (ctx.debug) {
if (ctx.is_debug()) {
LOG_DBG("%sSEQ start at %zu '%s' (%zu children)\n", debug_indent().c_str(), start_pos,
debug_input_snippet(start_pos).c_str(), p.children.size());
}
@@ -375,26 +375,19 @@ struct parser_executor {
for (size_t i = 0; i < p.children.size(); i++) {
const auto & child_id = p.children[i];
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sSEQ child %zu: %s\n", debug_indent().c_str(), i, arena.dump(child_id).c_str());
}
auto result = arena.parse(child_id, ctx, pos);
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sSEQ child %zu: %s at %zu->%zu\n", debug_indent().c_str(), i,
common_peg_parse_result_type_name(result.type), result.start, result.end);
}
if (result.fail()) {
ctx.parse_depth--;
if (ctx.is_partial && result.end >= ctx.input.size()) {
if (ctx.debug) {
fprintf(stderr, "%sSEQ -> NEED_MORE (child failed at end)\n", debug_indent().c_str());
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end,
std::move(nodes));
}
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sSEQ -> FAIL\n", debug_indent().c_str());
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, result.end);
@@ -406,7 +399,7 @@ struct parser_executor {
if (result.need_more_input()) {
ctx.parse_depth--;
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sSEQ -> NEED_MORE\n", debug_indent().c_str());
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, result.end, std::move(nodes));
@@ -416,14 +409,14 @@ struct parser_executor {
}
ctx.parse_depth--;
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sSEQ -> SUCCESS at %zu->%zu\n", debug_indent().c_str(), start_pos, pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos, std::move(nodes));
}
common_peg_parse_result operator()(const common_peg_choice_parser & p) {
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sCHOICE start at %zu '%s' (%zu options)\n", debug_indent().c_str(), start_pos,
debug_input_snippet(start_pos).c_str(), p.children.size());
}
@@ -432,17 +425,17 @@ struct parser_executor {
auto pos = start_pos;
for (size_t i = 0; i < p.children.size(); i++) {
const auto & child_id = p.children[i];
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sCHOICE option %zu: %s\n", debug_indent().c_str(), i, arena.dump(child_id).c_str());
}
auto result = arena.parse(child_id, ctx, pos);
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sCHOICE option %zu: %s\n", debug_indent().c_str(), i,
common_peg_parse_result_type_name(result.type));
}
if (!result.fail()) {
ctx.parse_depth--;
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sCHOICE -> %s (option %zu)\n", debug_indent().c_str(),
common_peg_parse_result_type_name(result.type), i);
}
@@ -451,14 +444,14 @@ struct parser_executor {
}
ctx.parse_depth--;
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sCHOICE -> FAIL (no options matched)\n", debug_indent().c_str());
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
common_peg_parse_result operator()(const common_peg_repetition_parser & p) {
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT start at %zu '%s' (min=%d, max=%d)\n", debug_indent().c_str(), start_pos,
debug_input_snippet(start_pos).c_str(), p.min_count, p.max_count);
}
@@ -471,7 +464,7 @@ struct parser_executor {
// Try to match up to max_count times (or unlimited if max_count is -1)
while (p.max_count == -1 || match_count < p.max_count) {
if (pos >= ctx.input.size()) {
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT: at end of input, count=%d\n", debug_indent().c_str(), match_count);
}
break;
@@ -479,7 +472,7 @@ struct parser_executor {
auto result = arena.parse(p.child, ctx, pos);
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT iter %d: %s at %zu->%zu, nodes=%zu\n", debug_indent().c_str(), match_count,
common_peg_parse_result_type_name(result.type), result.start, result.end, result.nodes.size());
fprintf(stderr, "%sREPEAT CHILD: %s\n", debug_indent().c_str(), arena.dump(p.child).c_str());
@@ -488,7 +481,7 @@ struct parser_executor {
if (result.success()) {
// Prevent infinite loop on empty matches
if (result.end == pos) {
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%s REPEAT: empty match, stopping\n", debug_indent().c_str());
}
break;
@@ -509,7 +502,7 @@ struct parser_executor {
}
ctx.parse_depth--;
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT -> NEED_MORE (count=%d, nodes=%zu)\n", debug_indent().c_str(),
match_count, nodes.size());
}
@@ -517,7 +510,7 @@ struct parser_executor {
}
// Child failed - stop trying
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT: child failed, stopping\n", debug_indent().c_str());
}
break;
@@ -526,14 +519,14 @@ struct parser_executor {
// Check if we got enough matches
if (p.min_count > 0 && match_count < p.min_count) {
ctx.parse_depth--;
if (pos >= ctx.input.size() && ctx.is_partial) {
if (ctx.debug) {
if (pos >= ctx.input.size() && ctx.is_lenient()) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT -> NEED_MORE (not enough matches: %d < %d)\n", debug_indent().c_str(),
match_count, p.min_count);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos, std::move(nodes));
}
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT -> FAIL (not enough matches: %d < %d)\n", debug_indent().c_str(), match_count,
p.min_count);
}
@@ -541,7 +534,7 @@ struct parser_executor {
}
ctx.parse_depth--;
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sREPEAT -> SUCCESS (count=%d, nodes=%zu)\n", debug_indent().c_str(), match_count,
nodes.size());
}
@@ -576,7 +569,7 @@ struct parser_executor {
auto result = common_parse_utf8_codepoint(ctx.input, start_pos);
if (result.status == utf8_parse_result::INCOMPLETE) {
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos);
@@ -615,7 +608,7 @@ struct parser_executor {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_SUCCESS, start_pos, pos);
}
// Not enough matches yet
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@@ -656,7 +649,7 @@ struct parser_executor {
// Check if we got enough matches
if (match_count < p.min_count) {
if (pos >= ctx.input.size() && ctx.is_partial) {
if (pos >= ctx.input.size() && ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
@@ -668,7 +661,7 @@ struct parser_executor {
static common_peg_parse_result handle_escape_sequence(common_peg_parse_context & ctx, size_t start, size_t & pos) {
++pos; // consume '\'
if (pos >= ctx.input.size()) {
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
@@ -698,7 +691,7 @@ struct parser_executor {
++pos; // consume 'u'
for (int i = 0; i < 4; ++i) {
if (pos >= ctx.input.size()) {
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start, pos);
@@ -732,7 +725,7 @@ struct parser_executor {
auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos);
if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@@ -747,7 +740,7 @@ struct parser_executor {
}
// Reached end without finding closing quote
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@@ -774,7 +767,7 @@ struct parser_executor {
auto utf8_result = common_parse_utf8_codepoint(ctx.input, pos);
if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@@ -789,7 +782,7 @@ struct parser_executor {
}
// Reached end without finding closing quote
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos, pos);
}
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, pos);
@@ -807,7 +800,7 @@ struct parser_executor {
if (utf8_result.status == utf8_parse_result::INCOMPLETE) {
// Incomplete UTF-8 sequence
if (!ctx.is_partial) {
if (!ctx.is_lenient()) {
// Input is complete but UTF-8 is incomplete = malformed
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_FAIL, start_pos);
}
@@ -837,7 +830,7 @@ struct parser_executor {
last_valid_pos = pos;
}
if (last_valid_pos == ctx.input.size() && ctx.is_partial) {
if (last_valid_pos == ctx.input.size() && ctx.is_lenient()) {
// Reached the end of a partial stream, there might still be more input that we need to consume.
return common_peg_parse_result(COMMON_PEG_PARSE_RESULT_NEED_MORE_INPUT, start_pos, last_valid_pos);
}
@@ -876,7 +869,7 @@ struct parser_executor {
common_peg_parse_result operator()(const common_peg_tag_parser & p) {
// Parse the child
if (ctx.debug) {
if (ctx.is_debug()) {
fprintf(stderr, "%sTAG: %s\n", debug_indent().c_str(), p.tag.c_str());
}
auto result = arena.parse(p.child, ctx, start_pos);

View File

@@ -139,22 +139,43 @@ struct common_peg_parse_result {
bool success() const { return type == COMMON_PEG_PARSE_RESULT_SUCCESS; }
};
enum common_peg_parse_flags {
COMMON_PEG_PARSE_FLAG_NONE = 0,
COMMON_PEG_PARSE_FLAG_LENIENT = 1 << 0,
COMMON_PEG_PARSE_FLAG_DEBUG = 1 << 1,
};
inline common_peg_parse_flags operator|(common_peg_parse_flags a, common_peg_parse_flags b) {
return static_cast<common_peg_parse_flags>(int(a) | int(b));
}
inline common_peg_parse_flags & operator|=(common_peg_parse_flags & a, common_peg_parse_flags b) {
return a = a | b;
}
inline common_peg_parse_flags operator&(common_peg_parse_flags a, common_peg_parse_flags b) {
return static_cast<common_peg_parse_flags>(int(a) & int(b));
}
inline common_peg_parse_flags operator~(common_peg_parse_flags a) {
return static_cast<common_peg_parse_flags>(~int(a));
}
struct common_peg_parse_context {
std::string input;
bool is_partial;
bool debug = false; // Enable debug output for parser tracing
common_peg_parse_flags flags;
common_peg_ast_arena ast;
int parse_depth;
common_peg_parse_context()
: is_partial(false), parse_depth(0) {}
common_peg_parse_context(common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE)
: flags(flags), parse_depth(0) {}
common_peg_parse_context(const std::string & input)
: input(input), is_partial(false), parse_depth(0) {}
common_peg_parse_context(const std::string & input, common_peg_parse_flags flags = COMMON_PEG_PARSE_FLAG_NONE)
: input(input), flags(flags), parse_depth(0) {}
common_peg_parse_context(const std::string & input, bool is_partial)
: input(input), is_partial(is_partial), parse_depth(0) {}
bool is_lenient() const { return flags & COMMON_PEG_PARSE_FLAG_LENIENT; }
bool is_debug() const { return flags & COMMON_PEG_PARSE_FLAG_DEBUG; }
};
class common_peg_arena;