This commit is contained in:
jif-oai
2026-02-24 12:50:36 +00:00
parent a670d313a6
commit 93e4b538a3
7 changed files with 165 additions and 14 deletions

View File

@@ -6466,6 +6466,17 @@ async fn try_run_sampling_request(
}
};
flush_citation_segments_all(
&sess,
&turn_context,
plan_mode_state.as_mut(),
&mut assistant_message_stream_parsers,
)
.await;
if let Some(state) = plan_mode_state.as_mut() {
flush_proposed_plan_segments_all(&sess, &turn_context, state).await;
}
drain_in_flight(&mut in_flight, sess.clone(), turn_context.clone()).await?;
if should_emit_turn_diff {

View File

@@ -227,7 +227,7 @@ pub(crate) fn last_assistant_message_from_item(
}
let stripped = strip_hidden_assistant_markup(&combined, plan_mode);
if stripped.trim().is_empty() {
return Some(String::new());
return None;
}
return Some(stripped);
}
@@ -320,12 +320,16 @@ mod tests {
}
#[test]
fn last_assistant_message_from_item_returns_empty_string_for_citation_only_message() {
fn last_assistant_message_from_item_returns_none_for_citation_only_message() {
let item = assistant_output_text("<citation>doc1</citation>");
let message = last_assistant_message_from_item(&item, false)
.expect("assistant item should still count as latest message");
assert_eq!(last_assistant_message_from_item(&item, false), None);
}
assert_eq!(message, "");
#[test]
fn last_assistant_message_from_item_returns_none_for_plan_only_hidden_message() {
    // In plan mode (`true`), an assistant message made up solely of a `<proposed_plan>` block
    // is expected to yield no message at all.
    let plan_only = assistant_output_text("<proposed_plan>\n- x\n</proposed_plan>");
    let message = last_assistant_message_from_item(&plan_only, true);
    assert_eq!(message, None);
}
}

View File

@@ -92,5 +92,9 @@ assert_eq!(out.extracted[0].content, "x");
- No tag attributes
- No nested tag support
- Unterminated open tags are auto-closed on `finish()` (buffered content is returned as extracted)
- `Utf8StreamParser::push_bytes(...)` rolls back the entire pushed chunk on invalid UTF-8 so the
wrapped parser does not observe a partial prefix from that chunk
- `Utf8StreamParser::into_inner()` returns an error if undecoded UTF-8 bytes are still buffered;
use `into_inner_lossy()` only if you intentionally want to drop buffered partial bytes
- `StreamTextParser::push_str(...)` accepts only valid UTF-8 (`&str`); use `Utf8StreamParser` if your
upstream source yields raw bytes

View File

@@ -15,6 +15,9 @@ const CITATION_CLOSE: &str = "</citation>";
///
/// This is a thin convenience wrapper around [`InlineHiddenTagParser`]. It returns citation bodies
/// as plain strings and omits the citation tags from visible text.
///
/// Matching is literal and non-nested. If EOF is reached before a closing `</citation>`, the
/// parser auto-closes the tag and returns the buffered body as an extracted citation.
#[derive(Debug)]
pub struct CitationStreamParser {
inner: InlineHiddenTagParser<CitationTag>,
@@ -59,6 +62,9 @@ impl StreamTextParser for CitationStreamParser {
}
/// Strip citation tags from a complete string and return `(visible_text, citations)`.
///
/// This uses [`CitationStreamParser`] internally, so it inherits the same semantics:
/// literal, non-nested matching and auto-closing unterminated citations at EOF.
pub fn strip_citations(text: &str) -> (String, Vec<String>) {
let mut parser = CitationStreamParser::new();
let mut out = parser.push_str(text);
@@ -146,4 +152,21 @@ mod tests {
assert_eq!(visible, "abc");
assert_eq!(citations, vec!["one".to_string(), "two".to_string()]);
}
#[test]
fn strip_citations_auto_closes_unterminated_citation_at_eof() {
    // The input ends while a citation is still open; the body seen so far must be extracted
    // rather than leaking into the visible text.
    let input = "x<citation>y";
    let (visible, citations) = strip_citations(input);

    assert_eq!(visible, "x");
    assert_eq!(citations, vec![String::from("y")]);
}
#[test]
fn citation_parser_does_not_support_nested_tags() {
    // The first `</citation>` closes the outer tag, so the inner opening tag stays inside the
    // extracted body and the second closing tag is left over as visible text.
    let input = "a<citation>x<citation>y</citation>z</citation>b";
    let (visible, citations) = strip_citations(input);

    assert_eq!(visible, "az</citation>b");
    assert_eq!(citations, vec![String::from("x<citation>y")]);
}
}

View File

@@ -52,6 +52,16 @@ where
!specs.is_empty(),
"InlineHiddenTagParser requires at least one tag spec"
);
for spec in &specs {
assert!(
!spec.open.is_empty(),
"InlineHiddenTagParser requires non-empty open delimiters"
);
assert!(
!spec.close.is_empty(),
"InlineHiddenTagParser requires non-empty close delimiters"
);
}
Self {
specs,
pending: String::new(),
@@ -290,4 +300,24 @@ mod tests {
assert_eq!(out.extracted[0].tag, Tag::B);
assert_eq!(out.extracted[0].content, "y");
}
#[test]
#[should_panic(expected = "non-empty open delimiters")]
fn generic_inline_parser_rejects_empty_open_delimiter() {
    // Construction is expected to panic because the opening delimiter is empty.
    let empty_open = InlineTagSpec {
        tag: Tag::A,
        open: "",
        close: "</a>",
    };
    let _ = InlineHiddenTagParser::new(vec![empty_open]);
}
#[test]
#[should_panic(expected = "non-empty close delimiters")]
fn generic_inline_parser_rejects_empty_close_delimiter() {
    // Construction is expected to panic because the closing delimiter is empty.
    let empty_close = InlineTagSpec {
        tag: Tag::A,
        open: "<a>",
        close: "",
    };
    let _ = InlineHiddenTagParser::new(vec![empty_close]);
}
}

View File

@@ -1,11 +1,3 @@
//! Streaming parsers for text that arrives in chunks.
//!
//! This crate is intentionally small and dependency-free. It provides:
//! - a generic [`StreamTextParser`] trait for incremental text parsers, and
//! - reusable parsers for hidden inline tags such as `<citation>...</citation>`.
//!
//! See the crate `README.md` for usage examples and extension guidance.
mod citation;
mod inline_hidden_tag;
mod stream_text;

View File

@@ -58,6 +58,11 @@ where
}
}
/// Feed a raw byte chunk.
///
/// If the chunk contains invalid UTF-8, this returns an error and rolls back the entire
/// pushed chunk so callers can decide how to recover without the inner parser seeing a partial
/// prefix from that chunk.
pub fn push_bytes(
&mut self,
chunk: &[u8],
@@ -143,7 +148,31 @@ where
Ok(out)
}
pub fn into_inner(self) -> P {
/// Consume this wrapper and hand back the inner parser, provided the byte buffer holds nothing
/// that fails UTF-8 validation.
///
/// Call [`Self::finish`] beforehand if buffered text should be flushed into the wrapped parser.
///
/// # Errors
///
/// - [`Utf8StreamParserError::InvalidUtf8`] when the buffered bytes contain a sequence that is
///   definitely invalid.
/// - [`Utf8StreamParserError::IncompleteUtf8AtEof`] when the buffered bytes end in the middle of
///   a code point.
pub fn into_inner(self) -> Result<P, Utf8StreamParserError> {
    // An empty buffer validates trivially, so it takes the same path as decodable bytes.
    let failure = match std::str::from_utf8(&self.pending_utf8) {
        Ok(_) => return Ok(self.inner),
        Err(failure) => failure,
    };

    // `error_len()` is `None` when the input merely ended early, and `Some` when an invalid
    // byte sequence of that length was found.
    Err(match failure.error_len() {
        Some(error_len) => Utf8StreamParserError::InvalidUtf8 {
            valid_up_to: failure.valid_up_to(),
            error_len,
        },
        None => Utf8StreamParserError::IncompleteUtf8AtEof,
    })
}
/// Return the wrapped parser without validating or flushing buffered undecoded bytes.
///
/// This may drop a partial UTF-8 code point that was buffered across chunk boundaries.
pub fn into_inner_lossy(self) -> P {
self.inner
}
}
@@ -154,6 +183,7 @@ mod tests {
use super::Utf8StreamParserError;
use crate::CitationStreamParser;
use crate::StreamTextChunk;
use crate::StreamTextParser;
use pretty_assertions::assert_eq;
@@ -223,6 +253,31 @@ mod tests {
assert!(tail.is_empty());
}
#[test]
fn utf8_stream_parser_rolls_back_entire_chunk_when_invalid_byte_follows_valid_prefix() {
    let mut parser = Utf8StreamParser::new(CitationStreamParser::new());

    // "ok" is valid, but 0xFF can never occur in UTF-8, so the whole push must be rejected.
    let rejected = parser.push_bytes(b"ok\xFF");
    let err = match rejected {
        Err(err) => err,
        Ok(out) => panic!("invalid byte should error, got output: {out:?}"),
    };
    let expected = Utf8StreamParserError::InvalidUtf8 {
        valid_up_to: 2,
        error_len: 1,
    };
    assert_eq!(err, expected);

    // After the rollback only the newly pushed byte shows up; the "ok" prefix must not.
    let recovered = parser.push_bytes(b"!");
    let next = match recovered {
        Err(err) => panic!("parser should recover after rollback: {err}"),
        Ok(out) => out,
    };
    assert_eq!(next.visible_text, "!");
    assert!(next.extracted.is_empty());
}
#[test]
fn utf8_stream_parser_errors_on_incomplete_code_point_at_eof() {
let mut parser = Utf8StreamParser::new(CitationStreamParser::new());
@@ -239,4 +294,36 @@ mod tests {
};
assert_eq!(err, Utf8StreamParserError::IncompleteUtf8AtEof);
}
#[test]
fn utf8_stream_parser_into_inner_errors_when_partial_code_point_is_buffered() {
    let mut parser = Utf8StreamParser::new(CitationStreamParser::new());

    // 0xC3 is the lead byte of a two-byte sequence; alone it should produce no output yet.
    let pushed = parser.push_bytes(&[0xC3]);
    let out = match pushed {
        Err(err) => panic!("partial code point should be buffered: {err}"),
        Ok(out) => out,
    };
    assert!(out.is_empty());

    // Unwrapping with that dangling byte still buffered must be refused.
    let err = match parser.into_inner() {
        Err(err) => err,
        Ok(_) => panic!("buffered partial code point should be rejected"),
    };
    assert_eq!(err, Utf8StreamParserError::IncompleteUtf8AtEof);
}
#[test]
fn utf8_stream_parser_into_inner_lossy_drops_buffered_partial_code_point() {
    let mut parser = Utf8StreamParser::new(CitationStreamParser::new());

    // 0xC3 is the lead byte of a two-byte sequence; alone it should produce no output yet.
    let pushed = parser.push_bytes(&[0xC3]);
    let out = match pushed {
        Err(err) => panic!("partial code point should be buffered: {err}"),
        Ok(out) => out,
    };
    assert!(out.is_empty());

    // The lossy unwrap discards the dangling byte, so the inner parser has nothing to flush.
    let mut inner = parser.into_inner_lossy();
    let tail = inner.finish();
    assert!(tail.is_empty());
}
}