feat: adding stream parser (#12666)

Add a stream parser to extract citations (and others) from a stream.
This supports cases where markers are split across different tokens.

Codex never managed to make this code work, so everything was done
manually. Please review carefully, and do not touch this part of the code
without a very clear understanding of it.
This commit is contained in:
jif-oai
2026-02-25 13:27:58 +00:00
committed by GitHub
parent 5a9a5b51b2
commit 5441130e0a
20 changed files with 2070 additions and 371 deletions

View File

@@ -0,0 +1,130 @@
use crate::CitationStreamParser;
use crate::ProposedPlanParser;
use crate::ProposedPlanSegment;
use crate::StreamTextChunk;
use crate::StreamTextParser;
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct AssistantTextChunk {
    /// Text safe to render immediately (citation/plan markup stripped).
    pub visible_text: String,
    /// Citation payloads extracted from `<oai-mem-citation>` tags in this chunk.
    pub citations: Vec<String>,
    /// Ordered plan segments; only populated when the parser runs in plan mode.
    pub plan_segments: Vec<ProposedPlanSegment>,
}
impl AssistantTextChunk {
    /// Returns `true` when the chunk carries no visible text, citations, or plan segments.
    pub fn is_empty(&self) -> bool {
        let no_text = self.visible_text.is_empty();
        let no_citations = self.citations.is_empty();
        let no_segments = self.plan_segments.is_empty();
        no_text && no_citations && no_segments
    }
}
/// Parses assistant text streaming markup in one pass:
/// - strips `<oai-mem-citation>` tags and extracts citation payloads
/// - in plan mode, also strips `<proposed_plan>` blocks and emits plan segments
#[derive(Debug, Default)]
pub struct AssistantTextStreamParser {
    /// When true, citation-stripped text is additionally routed through the plan parser.
    plan_mode: bool,
    /// First pass: strips citation tags from the raw stream.
    citations: CitationStreamParser,
    /// Second pass (plan mode only): extracts `<proposed_plan>` blocks.
    plan: ProposedPlanParser,
}
impl AssistantTextStreamParser {
    /// Create a parser; `plan_mode` enables `<proposed_plan>` block extraction.
    pub fn new(plan_mode: bool) -> Self {
        Self {
            plan_mode,
            ..Self::default()
        }
    }
    /// Feed a streamed text chunk.
    ///
    /// Citations are stripped first; the remaining visible text is then (in plan
    /// mode) run through the plan parser, so citation tags inside plan blocks are
    /// also extracted.
    pub fn push_str(&mut self, chunk: &str) -> AssistantTextChunk {
        let citation_chunk = self.citations.push_str(chunk);
        let mut out = self.parse_visible_text(citation_chunk.visible_text);
        out.citations = citation_chunk.extracted;
        out
    }
    /// Flush both parsers at end of stream.
    ///
    /// The citation parser's tail is routed through the plan parser first, and
    /// only then is the plan parser itself flushed, preserving segment order.
    pub fn finish(&mut self) -> AssistantTextChunk {
        let citation_chunk = self.citations.finish();
        let mut out = self.parse_visible_text(citation_chunk.visible_text);
        if self.plan_mode {
            let mut tail = self.plan.finish();
            if !tail.is_empty() {
                out.visible_text.push_str(&tail.visible_text);
                out.plan_segments.append(&mut tail.extracted);
            }
        }
        out.citations = citation_chunk.extracted;
        out
    }
    /// Route citation-stripped text through the plan parser when in plan mode;
    /// otherwise pass it through unchanged.
    fn parse_visible_text(&mut self, visible_text: String) -> AssistantTextChunk {
        if !self.plan_mode {
            return AssistantTextChunk {
                visible_text,
                ..AssistantTextChunk::default()
            };
        }
        let plan_chunk: StreamTextChunk<ProposedPlanSegment> = self.plan.push_str(&visible_text);
        AssistantTextChunk {
            visible_text: plan_chunk.visible_text,
            plan_segments: plan_chunk.extracted,
            ..AssistantTextChunk::default()
        }
    }
}
#[cfg(test)]
mod tests {
    use super::AssistantTextStreamParser;
    use crate::ProposedPlanSegment;
    use pretty_assertions::assert_eq;
    // A citation body split across two pushes must still be extracted, and the
    // tag text must never reach the visible output.
    #[test]
    fn parses_citations_across_seed_and_delta_boundaries() {
        let mut parser = AssistantTextStreamParser::new(false);
        let seeded = parser.push_str("hello <oai-mem-citation>doc");
        let parsed = parser.push_str("1</oai-mem-citation> world");
        let tail = parser.finish();
        assert_eq!(seeded.visible_text, "hello ");
        assert_eq!(seeded.citations, Vec::<String>::new());
        assert_eq!(parsed.visible_text, " world");
        assert_eq!(parsed.citations, vec!["doc1".to_string()]);
        assert_eq!(tail.visible_text, "");
        assert_eq!(tail.citations, Vec::<String>::new());
    }
    // Plan mode: citations are stripped first, then plan segments are parsed
    // from what remains — here a citation inside the plan block.
    #[test]
    fn parses_plan_segments_after_citation_stripping() {
        let mut parser = AssistantTextStreamParser::new(true);
        let seeded = parser.push_str("Intro\n<proposed");
        let parsed = parser.push_str("_plan>\n- step <oai-mem-citation>doc</oai-mem-citation>\n");
        let tail = parser.push_str("</proposed_plan>\nOutro");
        let finish = parser.finish();
        assert_eq!(seeded.visible_text, "Intro\n");
        assert_eq!(
            seeded.plan_segments,
            vec![ProposedPlanSegment::Normal("Intro\n".to_string())]
        );
        assert_eq!(parsed.visible_text, "");
        assert_eq!(parsed.citations, vec!["doc".to_string()]);
        assert_eq!(
            parsed.plan_segments,
            vec![
                ProposedPlanSegment::ProposedPlanStart,
                ProposedPlanSegment::ProposedPlanDelta("- step \n".to_string()),
            ]
        );
        assert_eq!(tail.visible_text, "Outro");
        assert_eq!(
            tail.plan_segments,
            vec![
                ProposedPlanSegment::ProposedPlanEnd,
                ProposedPlanSegment::Normal("Outro".to_string()),
            ]
        );
        assert!(finish.is_empty());
    }
}

View File

@@ -0,0 +1,179 @@
use crate::InlineHiddenTagParser;
use crate::InlineTagSpec;
use crate::StreamTextChunk;
use crate::StreamTextParser;
/// Marker for the single tag kind handled by this parser.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CitationTag {
    Citation,
}
// Literal delimiters; matching is exact (no attributes or whitespace variants).
const CITATION_OPEN: &str = "<oai-mem-citation>";
const CITATION_CLOSE: &str = "</oai-mem-citation>";
/// Stream parser for `<oai-mem-citation>...</oai-mem-citation>` tags.
///
/// This is a thin convenience wrapper around [`InlineHiddenTagParser`]. It returns citation bodies
/// as plain strings and omits the citation tags from visible text.
///
/// Matching is literal and non-nested. If EOF is reached before a closing
/// `</oai-mem-citation>`, the parser auto-closes the tag and returns the buffered body as an
/// extracted citation.
#[derive(Debug)]
pub struct CitationStreamParser {
    /// Generic inline-tag parser configured with the citation delimiters.
    inner: InlineHiddenTagParser<CitationTag>,
}
impl CitationStreamParser {
    /// Build a parser wired to the literal citation open/close delimiters.
    pub fn new() -> Self {
        let spec = InlineTagSpec {
            tag: CitationTag::Citation,
            open: CITATION_OPEN,
            close: CITATION_CLOSE,
        };
        Self {
            inner: InlineHiddenTagParser::new(vec![spec]),
        }
    }
}
impl Default for CitationStreamParser {
    /// Equivalent to [`CitationStreamParser::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl StreamTextParser for CitationStreamParser {
    type Extracted = String;
    /// Feed a chunk; extracted citation bodies are returned as plain strings.
    fn push_str(&mut self, chunk: &str) -> StreamTextChunk<Self::Extracted> {
        let parsed = self.inner.push_str(chunk);
        let bodies: Vec<String> = parsed.extracted.into_iter().map(|t| t.content).collect();
        StreamTextChunk {
            visible_text: parsed.visible_text,
            extracted: bodies,
        }
    }
    /// Flush the inner parser; an unterminated citation is auto-closed.
    fn finish(&mut self) -> StreamTextChunk<Self::Extracted> {
        let parsed = self.inner.finish();
        let bodies: Vec<String> = parsed.extracted.into_iter().map(|t| t.content).collect();
        StreamTextChunk {
            visible_text: parsed.visible_text,
            extracted: bodies,
        }
    }
}
/// Strip citation tags from a complete string and return `(visible_text, citations)`.
///
/// This uses [`CitationStreamParser`] internally, so it inherits the same semantics:
/// literal, non-nested matching and auto-closing unterminated citations at EOF.
pub fn strip_citations(text: &str) -> (String, Vec<String>) {
let mut parser = CitationStreamParser::new();
let mut out = parser.push_str(text);
let tail = parser.finish();
out.visible_text.push_str(&tail.visible_text);
out.extracted.extend(tail.extracted);
(out.visible_text, out.extracted)
}
#[cfg(test)]
mod tests {
    use super::CitationStreamParser;
    use super::strip_citations;
    use crate::StreamTextChunk;
    use crate::StreamTextParser;
    use pretty_assertions::assert_eq;
    // Drive the parser over several chunks and concatenate all output,
    // including the finish() tail.
    fn collect_chunks<P>(parser: &mut P, chunks: &[&str]) -> StreamTextChunk<P::Extracted>
    where
        P: StreamTextParser,
    {
        let mut all = StreamTextChunk::default();
        for chunk in chunks {
            let next = parser.push_str(chunk);
            all.visible_text.push_str(&next.visible_text);
            all.extracted.extend(next.extracted);
        }
        let tail = parser.finish();
        all.visible_text.push_str(&tail.visible_text);
        all.extracted.extend(tail.extracted);
        all
    }
    // Open and close delimiters are each split mid-tag across chunk boundaries.
    #[test]
    fn citation_parser_streams_across_chunk_boundaries() {
        let mut parser = CitationStreamParser::new();
        let out = collect_chunks(
            &mut parser,
            &[
                "Hello <oai-mem-",
                "citation>source A</oai-mem-",
                "citation> world",
            ],
        );
        assert_eq!(out.visible_text, "Hello world");
        assert_eq!(out.extracted, vec!["source A".to_string()]);
    }
    // A partial open-tag prefix is withheld from visible output until decided.
    #[test]
    fn citation_parser_buffers_partial_open_tag_prefix() {
        let mut parser = CitationStreamParser::new();
        let first = parser.push_str("abc <oai-mem-");
        assert_eq!(first.visible_text, "abc ");
        assert_eq!(first.extracted, Vec::<String>::new());
        let second = parser.push_str("citation>x</oai-mem-citation>z");
        let tail = parser.finish();
        assert_eq!(second.visible_text, "z");
        assert_eq!(second.extracted, vec!["x".to_string()]);
        assert!(tail.is_empty());
    }
    #[test]
    fn citation_parser_auto_closes_unterminated_tag_on_finish() {
        let mut parser = CitationStreamParser::new();
        let out = collect_chunks(&mut parser, &["x<oai-mem-citation>source"]);
        assert_eq!(out.visible_text, "x");
        assert_eq!(out.extracted, vec!["source".to_string()]);
    }
    // A prefix that never becomes a full open tag is surfaced verbatim at EOF.
    #[test]
    fn citation_parser_preserves_partial_open_tag_at_eof_if_not_a_full_tag() {
        let mut parser = CitationStreamParser::new();
        let out = collect_chunks(&mut parser, &["hello <oai-mem-"]);
        assert_eq!(out.visible_text, "hello <oai-mem-");
        assert_eq!(out.extracted, Vec::<String>::new());
    }
    #[test]
    fn strip_citations_collects_all_citations() {
        let (visible, citations) = strip_citations(
            "a<oai-mem-citation>one</oai-mem-citation>b<oai-mem-citation>two</oai-mem-citation>c",
        );
        assert_eq!(visible, "abc");
        assert_eq!(citations, vec!["one".to_string(), "two".to_string()]);
    }
    #[test]
    fn strip_citations_auto_closes_unterminated_citation_at_eof() {
        let (visible, citations) = strip_citations("x<oai-mem-citation>y");
        assert_eq!(visible, "x");
        assert_eq!(citations, vec!["y".to_string()]);
    }
    // Documents (rather than hides) the non-nested behavior: the first close
    // tag ends the block, and the leftover close tag is plain text.
    #[test]
    fn citation_parser_does_not_support_nested_tags() {
        let (visible, citations) = strip_citations(
            "a<oai-mem-citation>x<oai-mem-citation>y</oai-mem-citation>z</oai-mem-citation>b",
        );
        assert_eq!(visible, "az</oai-mem-citation>b");
        assert_eq!(citations, vec!["x<oai-mem-citation>y".to_string()]);
    }
}

View File

@@ -0,0 +1,323 @@
use crate::StreamTextChunk;
use crate::StreamTextParser;
/// One hidden inline tag extracted by [`InlineHiddenTagParser`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedInlineTag<T> {
    /// Which tag spec matched.
    pub tag: T,
    /// Text found between the open and close delimiters.
    pub content: String,
}
/// Literal tag specification used by [`InlineHiddenTagParser`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct InlineTagSpec<T> {
    /// Caller-defined identifier reported alongside extracted content.
    pub tag: T,
    /// Literal opening delimiter (must be non-empty; enforced in `new`).
    pub open: &'static str,
    /// Literal closing delimiter (must be non-empty; enforced in `new`).
    pub close: &'static str,
}
/// State for a currently-open tag whose close delimiter has not been seen yet.
#[derive(Debug, Clone, PartialEq, Eq)]
struct ActiveTag<T> {
    tag: T,
    /// Close delimiter to scan for.
    close: &'static str,
    /// Body text accumulated so far.
    content: String,
}
/// Generic streaming parser that hides configured inline tags and extracts their contents.
///
/// Example:
/// - input: `hello <oai-mem-citation>doc A</oai-mem-citation> world`
/// - visible output: `hello world`
/// - extracted: `["doc A"]`
///
/// Matching is literal and non-nested. If EOF is reached while a tag is still open, the parser
/// auto-closes it and returns the buffered content as extracted data.
#[derive(Debug)]
pub struct InlineHiddenTagParser<T>
where
    T: Clone + Eq,
{
    specs: Vec<InlineTagSpec<T>>,
    /// Input held back because it may still be part of a delimiter split
    /// across chunk boundaries.
    pending: String,
    /// Tag currently open, if any; its body accumulates until the close
    /// delimiter is found.
    active: Option<ActiveTag<T>>,
}
impl<T> InlineHiddenTagParser<T>
where
    T: Clone + Eq,
{
    /// Create a parser for one or more hidden inline tags.
    ///
    /// # Panics
    /// Panics when `specs` is empty or any spec has an empty delimiter.
    pub fn new(specs: Vec<InlineTagSpec<T>>) -> Self {
        assert!(
            !specs.is_empty(),
            "InlineHiddenTagParser requires at least one tag spec"
        );
        for spec in &specs {
            assert!(
                !spec.open.is_empty(),
                "InlineHiddenTagParser requires non-empty open delimiters"
            );
            assert!(
                !spec.close.is_empty(),
                "InlineHiddenTagParser requires non-empty close delimiters"
            );
        }
        Self {
            specs,
            pending: String::new(),
            active: None,
        }
    }
    /// Find the earliest open delimiter in `pending`, returning `(byte_pos, spec_index)`.
    ///
    /// Ties at the same byte offset prefer the longest delimiter (note the
    /// reversed `len_b.cmp(len_a)`), then the earliest spec in declaration order.
    fn find_next_open(&self) -> Option<(usize, usize)> {
        self.specs
            .iter()
            .enumerate()
            .filter_map(|(idx, spec)| {
                self.pending
                    .find(spec.open)
                    .map(|pos| (pos, spec.open.len(), idx))
            })
            .min_by(|(pos_a, len_a, idx_a), (pos_b, len_b, idx_b)| {
                pos_a
                    .cmp(pos_b)
                    .then_with(|| len_b.cmp(len_a))
                    .then_with(|| idx_a.cmp(idx_b))
            })
            .map(|(pos, _len, idx)| (pos, idx))
    }
    /// Length of the longest suffix of `pending` that could still grow into
    /// some open delimiter; that many bytes must stay buffered.
    fn max_open_prefix_suffix_len(&self) -> usize {
        self.specs
            .iter()
            .map(|spec| longest_suffix_prefix_len(&self.pending, spec.open))
            .max()
            // Idiomatic replacement for `map_or(0, std::convert::identity)`.
            .unwrap_or(0)
    }
    /// Append `pending` text to the chunk's visible output (no-op when empty).
    fn push_visible_prefix(out: &mut StreamTextChunk<ExtractedInlineTag<T>>, pending: &str) {
        if !pending.is_empty() {
            out.visible_text.push_str(pending);
        }
    }
    /// Emit all buffered text as visible except the trailing `keep_suffix_len`
    /// bytes, which may still complete a delimiter in a later chunk.
    fn drain_visible_to_suffix_match(
        &mut self,
        out: &mut StreamTextChunk<ExtractedInlineTag<T>>,
        keep_suffix_len: usize,
    ) {
        let take = self.pending.len().saturating_sub(keep_suffix_len);
        if take == 0 {
            return;
        }
        Self::push_visible_prefix(out, &self.pending[..take]);
        self.pending.drain(..take);
    }
}
impl<T> StreamTextParser for InlineHiddenTagParser<T>
where
    T: Clone + Eq,
{
    type Extracted = ExtractedInlineTag<T>;
    fn push_str(&mut self, chunk: &str) -> StreamTextChunk<Self::Extracted> {
        self.pending.push_str(chunk);
        let mut out = StreamTextChunk::default();
        loop {
            // Inside a tag: scan the buffer for that tag's close delimiter.
            if let Some(close) = self.active.as_ref().map(|active| active.close) {
                if let Some(close_idx) = self.pending.find(close) {
                    let Some(mut active) = self.active.take() else {
                        // Unreachable in practice: `active` was just observed as Some.
                        continue;
                    };
                    active.content.push_str(&self.pending[..close_idx]);
                    out.extracted.push(ExtractedInlineTag {
                        tag: active.tag,
                        content: active.content,
                    });
                    let close_len = close.len();
                    self.pending.drain(..close_idx + close_len);
                    continue;
                }
                // No close yet: keep only the bytes that might begin the close
                // delimiter; move everything before them into the tag body.
                let keep = longest_suffix_prefix_len(&self.pending, close);
                let take = self.pending.len().saturating_sub(keep);
                if take > 0 {
                    if let Some(active) = self.active.as_mut() {
                        active.content.push_str(&self.pending[..take]);
                    }
                    self.pending.drain(..take);
                }
                break;
            }
            // Outside a tag: look for the next open delimiter.
            if let Some((open_idx, spec_idx)) = self.find_next_open() {
                Self::push_visible_prefix(&mut out, &self.pending[..open_idx]);
                let spec = &self.specs[spec_idx];
                let open_len = spec.open.len();
                self.pending.drain(..open_idx + open_len);
                self.active = Some(ActiveTag {
                    tag: spec.tag.clone(),
                    close: spec.close,
                    content: String::new(),
                });
                continue;
            }
            // No open delimiter found: emit everything except a suffix that
            // could still grow into one across the next chunk boundary.
            let keep = self.max_open_prefix_suffix_len();
            self.drain_visible_to_suffix_match(&mut out, keep);
            break;
        }
        out
    }
    fn finish(&mut self) -> StreamTextChunk<Self::Extracted> {
        let mut out = StreamTextChunk::default();
        // An open tag at EOF is auto-closed; its buffered body is extracted.
        if let Some(mut active) = self.active.take() {
            if !self.pending.is_empty() {
                active.content.push_str(&self.pending);
                self.pending.clear();
            }
            out.extracted.push(ExtractedInlineTag {
                tag: active.tag,
                content: active.content,
            });
            return out;
        }
        // A partial open-delimiter prefix at EOF never became a tag: surface it verbatim.
        if !self.pending.is_empty() {
            out.visible_text.push_str(&self.pending);
            self.pending.clear();
        }
        out
    }
}
/// Length of the longest proper prefix of `needle` that is also a suffix of `s`.
///
/// Only char-boundary prefix lengths are considered, so the returned length is
/// always safe to slice with. Returns 0 when there is no overlap.
fn longest_suffix_prefix_len(s: &str, needle: &str) -> usize {
    let upper = needle.len().saturating_sub(1).min(s.len());
    (1..=upper)
        .rev()
        .find(|&k| needle.is_char_boundary(k) && s.ends_with(&needle[..k]))
        .unwrap_or(0)
}
#[cfg(test)]
mod tests {
use super::InlineHiddenTagParser;
use super::InlineTagSpec;
use crate::StreamTextChunk;
use crate::StreamTextParser;
use pretty_assertions::assert_eq;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Tag {
A,
B,
}
fn collect_chunks<P>(parser: &mut P, chunks: &[&str]) -> StreamTextChunk<P::Extracted>
where
P: StreamTextParser,
{
let mut all = StreamTextChunk::default();
for chunk in chunks {
let next = parser.push_str(chunk);
all.visible_text.push_str(&next.visible_text);
all.extracted.extend(next.extracted);
}
let tail = parser.finish();
all.visible_text.push_str(&tail.visible_text);
all.extracted.extend(tail.extracted);
all
}
#[test]
fn generic_inline_parser_supports_multiple_tag_types() {
let mut parser = InlineHiddenTagParser::new(vec![
InlineTagSpec {
tag: Tag::A,
open: "<a>",
close: "</a>",
},
InlineTagSpec {
tag: Tag::B,
open: "<b>",
close: "</b>",
},
]);
let out = collect_chunks(&mut parser, &["1<a>x</a>2<b>y</b>3"]);
assert_eq!(out.visible_text, "123");
assert_eq!(out.extracted.len(), 2);
assert_eq!(out.extracted[0].tag, Tag::A);
assert_eq!(out.extracted[0].content, "x");
assert_eq!(out.extracted[1].tag, Tag::B);
assert_eq!(out.extracted[1].content, "y");
}
#[test]
fn generic_inline_parser_supports_non_ascii_tag_delimiters() {
let mut parser = InlineHiddenTagParser::new(vec![InlineTagSpec {
tag: Tag::A,
open: "<é>",
close: "</é>",
}]);
let out = collect_chunks(&mut parser, &["a<", "é>中</", "é>b"]);
assert_eq!(out.visible_text, "ab");
assert_eq!(out.extracted.len(), 1);
assert_eq!(out.extracted[0].tag, Tag::A);
assert_eq!(out.extracted[0].content, "");
}
#[test]
fn generic_inline_parser_prefers_longest_opener_at_same_offset() {
let mut parser = InlineHiddenTagParser::new(vec![
InlineTagSpec {
tag: Tag::A,
open: "<a>",
close: "</a>",
},
InlineTagSpec {
tag: Tag::B,
open: "<ab>",
close: "</ab>",
},
]);
let out = collect_chunks(&mut parser, &["x<ab>y</ab>z"]);
assert_eq!(out.visible_text, "xz");
assert_eq!(out.extracted.len(), 1);
assert_eq!(out.extracted[0].tag, Tag::B);
assert_eq!(out.extracted[0].content, "y");
}
#[test]
#[should_panic(expected = "non-empty open delimiters")]
fn generic_inline_parser_rejects_empty_open_delimiter() {
let _ = InlineHiddenTagParser::new(vec![InlineTagSpec {
tag: Tag::A,
open: "",
close: "</a>",
}]);
}
#[test]
#[should_panic(expected = "non-empty close delimiters")]
fn generic_inline_parser_rejects_empty_close_delimiter() {
let _ = InlineHiddenTagParser::new(vec![InlineTagSpec {
tag: Tag::A,
open: "<a>",
close: "",
}]);
}
}

View File

@@ -0,0 +1,23 @@
mod assistant_text;
mod citation;
mod inline_hidden_tag;
mod proposed_plan;
mod stream_text;
mod tagged_line_parser;
mod utf8_stream;
// Public API surface: re-export everything at the crate root so callers do not
// depend on the internal module layout (tagged_line_parser stays private).
pub use assistant_text::AssistantTextChunk;
pub use assistant_text::AssistantTextStreamParser;
pub use citation::CitationStreamParser;
pub use citation::strip_citations;
pub use inline_hidden_tag::ExtractedInlineTag;
pub use inline_hidden_tag::InlineHiddenTagParser;
pub use inline_hidden_tag::InlineTagSpec;
pub use proposed_plan::ProposedPlanParser;
pub use proposed_plan::ProposedPlanSegment;
pub use proposed_plan::extract_proposed_plan_text;
pub use proposed_plan::strip_proposed_plan_blocks;
pub use stream_text::StreamTextChunk;
pub use stream_text::StreamTextParser;
pub use utf8_stream::Utf8StreamParser;
pub use utf8_stream::Utf8StreamParserError;

View File

@@ -0,0 +1,212 @@
use crate::StreamTextChunk;
use crate::StreamTextParser;
use crate::tagged_line_parser::TagSpec;
use crate::tagged_line_parser::TaggedLineParser;
use crate::tagged_line_parser::TaggedLineSegment;
// Plan tags are matched by the line-based parser, i.e. each must appear on its
// own line (surrounding whitespace allowed) to count as a tag.
const OPEN_TAG: &str = "<proposed_plan>";
const CLOSE_TAG: &str = "</proposed_plan>";
/// Marker for the single tag kind handled by this parser.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PlanTag {
    ProposedPlan,
}
/// Ordered segment of plan-mode output.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ProposedPlanSegment {
    /// Text outside any plan block (also contributes to visible text).
    Normal(String),
    /// An opening `<proposed_plan>` line was consumed.
    ProposedPlanStart,
    /// Text inside a plan block (hidden from visible text).
    ProposedPlanDelta(String),
    /// A closing `</proposed_plan>` line was consumed, or the block was
    /// auto-closed at end of stream.
    ProposedPlanEnd,
}
/// Parser for `<proposed_plan>` blocks emitted in plan mode.
///
/// Implements [`StreamTextParser`] so callers can consume:
/// - `visible_text`: normal assistant text with plan blocks removed
/// - `extracted`: ordered plan segments (includes `Normal(...)` segments for ordering fidelity)
#[derive(Debug)]
pub struct ProposedPlanParser {
    /// Line-oriented parser configured with the plan open/close tags.
    parser: TaggedLineParser<PlanTag>,
}
impl ProposedPlanParser {
    /// Build a parser configured for the literal plan open/close tags.
    pub fn new() -> Self {
        let spec = TagSpec {
            open: OPEN_TAG,
            close: CLOSE_TAG,
            tag: PlanTag::ProposedPlan,
        };
        Self {
            parser: TaggedLineParser::new(vec![spec]),
        }
    }
}
impl Default for ProposedPlanParser {
    /// Equivalent to [`ProposedPlanParser::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl StreamTextParser for ProposedPlanParser {
    type Extracted = ProposedPlanSegment;
    /// Feed a chunk; returns visible text plus ordered plan segments.
    fn push_str(&mut self, chunk: &str) -> StreamTextChunk<Self::Extracted> {
        let segments = self.parser.parse(chunk);
        map_segments(segments)
    }
    /// Flush buffered state; an unterminated plan block is auto-closed.
    fn finish(&mut self) -> StreamTextChunk<Self::Extracted> {
        let segments = self.parser.finish();
        map_segments(segments)
    }
}
/// Translate low-level line segments into plan segments, accumulating only
/// `Normal` text into the chunk's visible output.
fn map_segments(segments: Vec<TaggedLineSegment<PlanTag>>) -> StreamTextChunk<ProposedPlanSegment> {
    let mut chunk = StreamTextChunk::default();
    for segment in segments {
        match segment {
            TaggedLineSegment::Normal(text) => {
                chunk.visible_text.push_str(&text);
                chunk.extracted.push(ProposedPlanSegment::Normal(text));
            }
            TaggedLineSegment::TagStart(PlanTag::ProposedPlan) => {
                chunk.extracted.push(ProposedPlanSegment::ProposedPlanStart);
            }
            TaggedLineSegment::TagDelta(PlanTag::ProposedPlan, text) => {
                chunk
                    .extracted
                    .push(ProposedPlanSegment::ProposedPlanDelta(text));
            }
            TaggedLineSegment::TagEnd(PlanTag::ProposedPlan) => {
                chunk.extracted.push(ProposedPlanSegment::ProposedPlanEnd);
            }
        }
    }
    chunk
}
/// Remove `<proposed_plan>` blocks from a complete string, keeping normal text.
pub fn strip_proposed_plan_blocks(text: &str) -> String {
    let mut parser = ProposedPlanParser::new();
    let mut visible = parser.push_str(text).visible_text;
    let tail = parser.finish();
    visible.push_str(&tail.visible_text);
    visible
}
/// Extract the body text of the `<proposed_plan>` block in `text`.
///
/// Returns `None` when no plan block was opened. Note that the buffer is
/// cleared at every `ProposedPlanStart`, so if several blocks are present only
/// the last block's text is returned (while `Some` is still reported).
pub fn extract_proposed_plan_text(text: &str) -> Option<String> {
    let mut parser = ProposedPlanParser::new();
    let mut plan_text = String::new();
    let mut saw_plan_block = false;
    for segment in parser
        .push_str(text)
        .extracted
        .into_iter()
        .chain(parser.finish().extracted)
    {
        match segment {
            ProposedPlanSegment::ProposedPlanStart => {
                saw_plan_block = true;
                plan_text.clear();
            }
            ProposedPlanSegment::ProposedPlanDelta(delta) => {
                plan_text.push_str(&delta);
            }
            ProposedPlanSegment::ProposedPlanEnd | ProposedPlanSegment::Normal(_) => {}
        }
    }
    saw_plan_block.then_some(plan_text)
}
#[cfg(test)]
mod tests {
    use super::ProposedPlanParser;
    use super::ProposedPlanSegment;
    use super::extract_proposed_plan_text;
    use super::strip_proposed_plan_blocks;
    use crate::StreamTextChunk;
    use crate::StreamTextParser;
    use pretty_assertions::assert_eq;
    // Drive the parser over several chunks and concatenate all output,
    // including the finish() tail.
    fn collect_chunks<P>(parser: &mut P, chunks: &[&str]) -> StreamTextChunk<P::Extracted>
    where
        P: StreamTextParser,
    {
        let mut all = StreamTextChunk::default();
        for chunk in chunks {
            let next = parser.push_str(chunk);
            all.visible_text.push_str(&next.visible_text);
            all.extracted.extend(next.extracted);
        }
        let tail = parser.finish();
        all.visible_text.push_str(&tail.visible_text);
        all.extracted.extend(tail.extracted);
        all
    }
    // The open tag is split mid-word across chunks; segments must still be
    // ordered Normal / Start / Delta / End / Normal.
    #[test]
    fn streams_proposed_plan_segments_and_visible_text() {
        let mut parser = ProposedPlanParser::new();
        let out = collect_chunks(
            &mut parser,
            &[
                "Intro text\n<prop",
                "osed_plan>\n- step 1\n",
                "</proposed_plan>\nOutro",
            ],
        );
        assert_eq!(out.visible_text, "Intro text\nOutro");
        assert_eq!(
            out.extracted,
            vec![
                ProposedPlanSegment::Normal("Intro text\n".to_string()),
                ProposedPlanSegment::ProposedPlanStart,
                ProposedPlanSegment::ProposedPlanDelta("- step 1\n".to_string()),
                ProposedPlanSegment::ProposedPlanEnd,
                ProposedPlanSegment::Normal("Outro".to_string()),
            ]
        );
    }
    // A line with extra text after the tag is not a tag line; it passes through.
    #[test]
    fn preserves_non_tag_lines() {
        let mut parser = ProposedPlanParser::new();
        let out = collect_chunks(&mut parser, &[" <proposed_plan> extra\n"]);
        assert_eq!(out.visible_text, " <proposed_plan> extra\n");
        assert_eq!(
            out.extracted,
            vec![ProposedPlanSegment::Normal(
                " <proposed_plan> extra\n".to_string()
            )]
        );
    }
    // finish() emits a synthetic ProposedPlanEnd for an unterminated block.
    #[test]
    fn closes_unterminated_plan_block_on_finish() {
        let mut parser = ProposedPlanParser::new();
        let out = collect_chunks(&mut parser, &["<proposed_plan>\n- step 1\n"]);
        assert_eq!(out.visible_text, "");
        assert_eq!(
            out.extracted,
            vec![
                ProposedPlanSegment::ProposedPlanStart,
                ProposedPlanSegment::ProposedPlanDelta("- step 1\n".to_string()),
                ProposedPlanSegment::ProposedPlanEnd,
            ]
        );
    }
    #[test]
    fn strips_proposed_plan_blocks_from_text() {
        let text = "before\n<proposed_plan>\n- step\n</proposed_plan>\nafter";
        assert_eq!(strip_proposed_plan_blocks(text), "before\nafter");
    }
    #[test]
    fn extracts_proposed_plan_text() {
        let text = "before\n<proposed_plan>\n- step\n</proposed_plan>\nafter";
        assert_eq!(
            extract_proposed_plan_text(text),
            Some("- step\n".to_string())
        );
    }
}

View File

@@ -0,0 +1,36 @@
/// Incremental parser result for one pushed chunk (or final flush).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StreamTextChunk<T> {
    /// Text safe to render immediately.
    pub visible_text: String,
    /// Hidden payloads extracted from the chunk.
    pub extracted: Vec<T>,
}

impl<T> Default for StreamTextChunk<T> {
    // Hand-written (not derived) so `StreamTextChunk<T>: Default` holds even
    // when `T` itself has no `Default` — only empty containers are needed.
    fn default() -> Self {
        Self {
            visible_text: String::new(),
            extracted: Vec::new(),
        }
    }
}

impl<T> StreamTextChunk<T> {
    /// Returns true when no visible text or extracted payloads were produced.
    pub fn is_empty(&self) -> bool {
        let has_output = !self.visible_text.is_empty() || !self.extracted.is_empty();
        !has_output
    }
}
/// Trait for parsers that consume streamed text and emit visible text plus extracted payloads.
///
/// Typical usage: call [`Self::push_str`] once per streamed delta, then
/// [`Self::finish`] exactly once at end of stream to flush buffered state.
pub trait StreamTextParser {
    /// Payload extracted by this parser (for example a citation body).
    type Extracted;
    /// Feed a new text chunk.
    fn push_str(&mut self, chunk: &str) -> StreamTextChunk<Self::Extracted>;
    /// Flush any buffered state at end-of-stream (or end-of-item).
    fn finish(&mut self) -> StreamTextChunk<Self::Extracted>;
}

View File

@@ -0,0 +1,249 @@
//! Line-based tag block parsing for streamed text.
//!
//! The parser buffers each line until it can disprove that the line is a tag,
//! which is required for tags that must appear alone on a line.
/// Literal open/close tag text plus a caller-defined identifier.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct TagSpec<T> {
    /// Exact text of an opening tag line (after trimming surrounding whitespace).
    pub(crate) open: &'static str,
    /// Exact text of a closing tag line (after trimming surrounding whitespace).
    pub(crate) close: &'static str,
    pub(crate) tag: T,
}
/// One parsed segment, emitted in input order.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) enum TaggedLineSegment<T> {
    /// Text outside any tag block.
    Normal(String),
    /// An open-tag line was consumed.
    TagStart(T),
    /// Text inside a tag block.
    TagDelta(T, String),
    /// A close-tag line was consumed, or the block was auto-closed at EOF.
    TagEnd(T),
}
/// Stateful line parser that splits input into normal text vs tag blocks.
#[derive(Debug, Default)]
pub(crate) struct TaggedLineParser<T>
where
    T: Copy + Eq,
{
    specs: Vec<TagSpec<T>>,
    /// Tag whose block we are currently inside, if any.
    active_tag: Option<T>,
    /// True while the current line could still turn out to be a lone tag line
    /// and must therefore be buffered in `line_buffer`.
    detect_tag: bool,
    /// Partial current line, buffered while `detect_tag` is true.
    line_buffer: String,
}
impl<T> TaggedLineParser<T>
where
    T: Copy + Eq,
{
    /// Create a parser; detection starts enabled for the first line.
    pub(crate) fn new(specs: Vec<TagSpec<T>>) -> Self {
        Self {
            specs,
            active_tag: None,
            detect_tag: true,
            line_buffer: String::new(),
        }
    }
    /// Consume a streamed delta, returning segments in input order.
    ///
    /// Lines that might still be a lone tag line are buffered in `line_buffer`
    /// until a newline decides them; all other text is forwarded immediately
    /// via `run`.
    pub(crate) fn parse(&mut self, delta: &str) -> Vec<TaggedLineSegment<T>> {
        let mut segments = Vec::new();
        let mut run = String::new();
        for ch in delta.chars() {
            if self.detect_tag {
                // Flush any fast-path run gathered before re-entering detection.
                if !run.is_empty() {
                    self.push_text(std::mem::take(&mut run), &mut segments);
                }
                self.line_buffer.push(ch);
                if ch == '\n' {
                    self.finish_line(&mut segments);
                    continue;
                }
                let slug = self.line_buffer.trim_start();
                // Keep buffering while the line is still a plausible tag prefix.
                if slug.is_empty() || self.is_tag_prefix(slug) {
                    continue;
                }
                // Disproved: emit the buffered text and stop inspecting this line.
                let buffered = std::mem::take(&mut self.line_buffer);
                self.detect_tag = false;
                self.push_text(buffered, &mut segments);
                continue;
            }
            // Fast path: rest of a line already known not to be a tag.
            run.push(ch);
            if ch == '\n' {
                self.push_text(std::mem::take(&mut run), &mut segments);
                // Next line starts fresh; re-enable detection.
                self.detect_tag = true;
            }
        }
        if !run.is_empty() {
            self.push_text(run, &mut segments);
        }
        segments
    }
    /// Flush the buffered line at EOF and auto-close any open tag block.
    pub(crate) fn finish(&mut self) -> Vec<TaggedLineSegment<T>> {
        let mut segments = Vec::new();
        if !self.line_buffer.is_empty() {
            // A final line without a trailing newline can still be a tag line.
            let buffered = std::mem::take(&mut self.line_buffer);
            let without_newline = buffered.strip_suffix('\n').unwrap_or(&buffered);
            let slug = without_newline.trim_start().trim_end();
            if let Some(tag) = self.match_open(slug)
                && self.active_tag.is_none()
            {
                push_segment(&mut segments, TaggedLineSegment::TagStart(tag));
                self.active_tag = Some(tag);
            } else if let Some(tag) = self.match_close(slug)
                && self.active_tag == Some(tag)
            {
                push_segment(&mut segments, TaggedLineSegment::TagEnd(tag));
                self.active_tag = None;
            } else {
                self.push_text(buffered, &mut segments);
            }
        }
        // Unterminated block at EOF: emit a synthetic TagEnd.
        if let Some(tag) = self.active_tag.take() {
            push_segment(&mut segments, TaggedLineSegment::TagEnd(tag));
        }
        self.detect_tag = true;
        segments
    }
    /// Decide a complete buffered line: tag open, tag close, or plain text.
    fn finish_line(&mut self, segments: &mut Vec<TaggedLineSegment<T>>) {
        let line = std::mem::take(&mut self.line_buffer);
        let without_newline = line.strip_suffix('\n').unwrap_or(&line);
        let slug = without_newline.trim_start().trim_end();
        // An open tag only counts outside a block; nesting is not supported.
        if let Some(tag) = self.match_open(slug)
            && self.active_tag.is_none()
        {
            push_segment(segments, TaggedLineSegment::TagStart(tag));
            self.active_tag = Some(tag);
            self.detect_tag = true;
            return;
        }
        // A close tag only counts when it matches the currently-open block.
        if let Some(tag) = self.match_close(slug)
            && self.active_tag == Some(tag)
        {
            push_segment(segments, TaggedLineSegment::TagEnd(tag));
            self.active_tag = None;
            self.detect_tag = true;
            return;
        }
        self.detect_tag = true;
        self.push_text(line, segments);
    }
    /// Route text to a `TagDelta` (inside a block) or `Normal` segment.
    fn push_text(&self, text: String, segments: &mut Vec<TaggedLineSegment<T>>) {
        if let Some(tag) = self.active_tag {
            push_segment(segments, TaggedLineSegment::TagDelta(tag, text));
        } else {
            push_segment(segments, TaggedLineSegment::Normal(text));
        }
    }
    /// True while `slug` could still grow into some open or close tag.
    fn is_tag_prefix(&self, slug: &str) -> bool {
        let slug = slug.trim_end();
        self.specs
            .iter()
            .any(|spec| spec.open.starts_with(slug) || spec.close.starts_with(slug))
    }
    /// Exact match of a trimmed line against an open tag.
    fn match_open(&self, slug: &str) -> Option<T> {
        self.specs
            .iter()
            .find(|spec| spec.open == slug)
            .map(|spec| spec.tag)
    }
    /// Exact match of a trimmed line against a close tag.
    fn match_close(&self, slug: &str) -> Option<T> {
        self.specs
            .iter()
            .find(|spec| spec.close == slug)
            .map(|spec| spec.tag)
    }
}
/// Append `segment`, merging consecutive text segments of the same kind (and,
/// for `TagDelta`, the same tag). Empty text segments are dropped.
fn push_segment<T>(segments: &mut Vec<TaggedLineSegment<T>>, segment: TaggedLineSegment<T>)
where
    T: Copy + Eq,
{
    match segment {
        TaggedLineSegment::Normal(delta) => {
            if delta.is_empty() {
                return;
            }
            if let Some(TaggedLineSegment::Normal(existing)) = segments.last_mut() {
                existing.push_str(&delta);
                return;
            }
            segments.push(TaggedLineSegment::Normal(delta));
        }
        TaggedLineSegment::TagDelta(tag, delta) => {
            if delta.is_empty() {
                return;
            }
            if let Some(TaggedLineSegment::TagDelta(existing_tag, existing)) = segments.last_mut() {
                if *existing_tag == tag {
                    existing.push_str(&delta);
                    return;
                }
            }
            segments.push(TaggedLineSegment::TagDelta(tag, delta));
        }
        start @ TaggedLineSegment::TagStart(_) => segments.push(start),
        end @ TaggedLineSegment::TagEnd(_) => segments.push(end),
    }
}
#[cfg(test)]
mod tests {
    use super::TagSpec;
    use super::TaggedLineParser;
    use super::TaggedLineSegment;
    use pretty_assertions::assert_eq;
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum Tag {
        Block,
    }
    fn parser() -> TaggedLineParser<Tag> {
        TaggedLineParser::new(vec![TagSpec {
            open: "<tag>",
            close: "</tag>",
            tag: Tag::Block,
        }])
    }
    // An open tag split across two deltas must still be recognized once the
    // line completes.
    #[test]
    fn buffers_prefix_until_tag_is_decided() {
        let mut parser = parser();
        let mut segments = parser.parse("<t");
        segments.extend(parser.parse("ag>\nline\n</tag>\n"));
        segments.extend(parser.finish());
        assert_eq!(
            segments,
            vec![
                TaggedLineSegment::TagStart(Tag::Block),
                TaggedLineSegment::TagDelta(Tag::Block, "line\n".to_string()),
                TaggedLineSegment::TagEnd(Tag::Block),
            ]
        );
    }
    // A tag followed by extra text on the same line is not a tag line.
    #[test]
    fn rejects_tag_lines_with_extra_text() {
        let mut parser = parser();
        let mut segments = parser.parse("<tag> extra\n");
        segments.extend(parser.finish());
        assert_eq!(
            segments,
            vec![TaggedLineSegment::Normal("<tag> extra\n".to_string())]
        );
    }
}

View File

@@ -0,0 +1,333 @@
use std::error::Error;
use std::fmt;
use crate::StreamTextChunk;
use crate::StreamTextParser;
/// Error returned by [`Utf8StreamParser`] when streamed bytes are not valid UTF-8.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Utf8StreamParserError {
    /// The provided bytes contain an invalid UTF-8 sequence.
    InvalidUtf8 {
        /// Byte offset in the parser's buffered bytes where decoding failed.
        valid_up_to: usize,
        /// Length in bytes of the invalid sequence.
        error_len: usize,
    },
    /// EOF was reached with a buffered partial UTF-8 code point.
    IncompleteUtf8AtEof,
}
impl fmt::Display for Utf8StreamParserError {
    // Human-readable messages; the structured fields stay available on the enum.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::InvalidUtf8 {
                valid_up_to,
                error_len,
            } => write!(
                f,
                "invalid UTF-8 in streamed bytes at offset {valid_up_to} (error length {error_len})"
            ),
            Self::IncompleteUtf8AtEof => {
                write!(f, "incomplete UTF-8 code point at end of stream")
            }
        }
    }
}
// Marker impl so the error works with `Box<dyn Error>` / `?` conversions.
impl Error for Utf8StreamParserError {}
/// Wraps a [`StreamTextParser`] and accepts raw bytes, buffering partial UTF-8 code points.
///
/// This is useful when upstream data arrives as `&[u8]` and a code point may be split across
/// chunk boundaries (for example `0xC3` followed by `0xA9` for `é`).
#[derive(Debug)]
pub struct Utf8StreamParser<P> {
    inner: P,
    /// Bytes held back because they end mid code point (or were kept after a
    /// rolled-back invalid chunk).
    pending_utf8: Vec<u8>,
}
impl<P> Utf8StreamParser<P>
where
    P: StreamTextParser,
{
    /// Wrap `inner` so it can be fed raw bytes instead of `&str` chunks.
    pub fn new(inner: P) -> Self {
        Self {
            inner,
            pending_utf8: Vec::new(),
        }
    }
    /// Feed a raw byte chunk.
    ///
    /// If the chunk contains invalid UTF-8, this returns an error and rolls back the entire
    /// pushed chunk so callers can decide how to recover without the inner parser seeing a partial
    /// prefix from that chunk.
    pub fn push_bytes(
        &mut self,
        chunk: &[u8],
    ) -> Result<StreamTextChunk<P::Extracted>, Utf8StreamParserError> {
        let old_len = self.pending_utf8.len();
        self.pending_utf8.extend_from_slice(chunk);
        match std::str::from_utf8(&self.pending_utf8) {
            Ok(text) => {
                // Whole buffer decodes: forward it and drop the raw bytes.
                let out = self.inner.push_str(text);
                self.pending_utf8.clear();
                Ok(out)
            }
            Err(err) => {
                if let Some(error_len) = err.error_len() {
                    // Genuinely invalid sequence: roll back this chunk entirely.
                    self.pending_utf8.truncate(old_len);
                    return Err(Utf8StreamParserError::InvalidUtf8 {
                        valid_up_to: err.valid_up_to(),
                        error_len,
                    });
                }
                // `error_len() == None` means the buffer merely ends mid code
                // point: forward the decodable prefix and keep the tail buffered.
                let valid_up_to = err.valid_up_to();
                if valid_up_to == 0 {
                    return Ok(StreamTextChunk::default());
                }
                let text = match std::str::from_utf8(&self.pending_utf8[..valid_up_to]) {
                    Ok(text) => text,
                    Err(prefix_err) => {
                        // Defensive: `..valid_up_to` is valid by construction;
                        // this branch avoids `unwrap`/`unsafe` if that invariant
                        // is ever broken.
                        self.pending_utf8.truncate(old_len);
                        let error_len = prefix_err.error_len().unwrap_or(0);
                        return Err(Utf8StreamParserError::InvalidUtf8 {
                            valid_up_to: prefix_err.valid_up_to(),
                            error_len,
                        });
                    }
                };
                let out = self.inner.push_str(text);
                self.pending_utf8.drain(..valid_up_to);
                Ok(out)
            }
        }
    }
    /// Flush buffered bytes into the inner parser and finish it.
    ///
    /// # Errors
    /// Returns [`Utf8StreamParserError::IncompleteUtf8AtEof`] when a partial
    /// code point is still buffered, or [`Utf8StreamParserError::InvalidUtf8`]
    /// when the buffered bytes are invalid; in both cases the buffer and the
    /// inner parser are left untouched.
    pub fn finish(&mut self) -> Result<StreamTextChunk<P::Extracted>, Utf8StreamParserError> {
        // Decode (and thereby validate) the buffered bytes in a single pass.
        let mut out = if self.pending_utf8.is_empty() {
            StreamTextChunk::default()
        } else {
            let text = match std::str::from_utf8(&self.pending_utf8) {
                Ok(text) => text,
                Err(err) => {
                    return Err(match err.error_len() {
                        Some(error_len) => Utf8StreamParserError::InvalidUtf8 {
                            valid_up_to: err.valid_up_to(),
                            error_len,
                        },
                        // A trailing partial code point at EOF.
                        None => Utf8StreamParserError::IncompleteUtf8AtEof,
                    });
                }
            };
            let out = self.inner.push_str(text);
            self.pending_utf8.clear();
            out
        };
        let mut tail = self.inner.finish();
        out.visible_text.push_str(&tail.visible_text);
        out.extracted.append(&mut tail.extracted);
        Ok(out)
    }
    /// Return the wrapped parser if no undecoded UTF-8 bytes are buffered.
    ///
    /// Use [`Self::finish`] first if you want to flush buffered text into the wrapped parser.
    pub fn into_inner(self) -> Result<P, Utf8StreamParserError> {
        if self.pending_utf8.is_empty() {
            return Ok(self.inner);
        }
        match std::str::from_utf8(&self.pending_utf8) {
            // Decodable leftovers are dropped without being fed to `inner`.
            Ok(_) => Ok(self.inner),
            Err(err) => {
                if let Some(error_len) = err.error_len() {
                    return Err(Utf8StreamParserError::InvalidUtf8 {
                        valid_up_to: err.valid_up_to(),
                        error_len,
                    });
                }
                Err(Utf8StreamParserError::IncompleteUtf8AtEof)
            }
        }
    }
    /// Return the wrapped parser without validating or flushing buffered undecoded bytes.
    ///
    /// This may drop a partial UTF-8 code point that was buffered across chunk boundaries.
    pub fn into_inner_lossy(self) -> P {
        self.inner
    }
}
#[cfg(test)]
mod tests {
    use super::Utf8StreamParser;
    use super::Utf8StreamParserError;
    use crate::CitationStreamParser;
    use crate::StreamTextChunk;
    use crate::StreamTextParser;
    use pretty_assertions::assert_eq;

    /// Drive `parser` over every byte chunk, then flush it, gathering all
    /// emitted output into one accumulated chunk.
    fn collect_bytes(
        parser: &mut Utf8StreamParser<CitationStreamParser>,
        chunks: &[&[u8]],
    ) -> Result<StreamTextChunk<String>, Utf8StreamParserError> {
        let mut combined = StreamTextChunk::default();
        for piece in chunks {
            let parsed = parser.push_bytes(piece)?;
            combined.visible_text.push_str(&parsed.visible_text);
            combined.extracted.extend(parsed.extracted);
        }
        let flushed = parser.finish()?;
        combined.visible_text.push_str(&flushed.visible_text);
        combined.extracted.extend(flushed.extracted);
        Ok(combined)
    }

    #[test]
    fn utf8_stream_parser_handles_split_code_points_across_chunks() {
        let byte_chunks: [&[u8]; 3] = [
            b"A\xC3",
            b"\xA9<oai-mem-citation>\xE4",
            b"\xB8\xAD</oai-mem-citation>Z",
        ];
        let mut parser = Utf8StreamParser::new(CitationStreamParser::new());
        let out = collect_bytes(&mut parser, &byte_chunks)
            .unwrap_or_else(|err| panic!("valid UTF-8 stream should parse: {err}"));
        assert_eq!(out.visible_text, "AéZ");
        assert_eq!(out.extracted, vec!["".to_string()]);
    }

    #[test]
    fn utf8_stream_parser_rolls_back_on_invalid_utf8_chunk() {
        let mut parser = Utf8StreamParser::new(CitationStreamParser::new());
        let first = parser
            .push_bytes(&[0xC3])
            .unwrap_or_else(|err| panic!("leading byte may be buffered until next chunk: {err}"));
        assert!(first.is_empty());
        let err = match parser.push_bytes(&[0x28]) {
            Err(err) => err,
            Ok(out) => panic!("invalid continuation byte should error, got output: {out:?}"),
        };
        assert_eq!(
            err,
            Utf8StreamParserError::InvalidUtf8 {
                valid_up_to: 0,
                error_len: 1,
            }
        );
        let second = parser
            .push_bytes(&[0xA9, b'x'])
            .unwrap_or_else(|err| panic!("state should still allow a valid continuation: {err}"));
        let tail = parser
            .finish()
            .unwrap_or_else(|err| panic!("stream should finish: {err}"));
        assert_eq!(second.visible_text, "éx");
        assert!(second.extracted.is_empty());
        assert!(tail.is_empty());
    }

    #[test]
    fn utf8_stream_parser_rolls_back_entire_chunk_when_invalid_byte_follows_valid_prefix() {
        let mut parser = Utf8StreamParser::new(CitationStreamParser::new());
        let err = match parser.push_bytes(b"ok\xFF") {
            Err(err) => err,
            Ok(out) => panic!("invalid byte should error, got output: {out:?}"),
        };
        assert_eq!(
            err,
            Utf8StreamParserError::InvalidUtf8 {
                valid_up_to: 2,
                error_len: 1,
            }
        );
        let next = parser
            .push_bytes(b"!")
            .unwrap_or_else(|err| panic!("parser should recover after rollback: {err}"));
        assert_eq!(next.visible_text, "!");
        assert!(next.extracted.is_empty());
    }

    #[test]
    fn utf8_stream_parser_errors_on_incomplete_code_point_at_eof() {
        let mut parser = Utf8StreamParser::new(CitationStreamParser::new());
        let out = parser
            .push_bytes(&[0xE2, 0x82])
            .unwrap_or_else(|err| panic!("partial code point should be buffered: {err}"));
        assert!(out.is_empty());
        let err = match parser.finish() {
            Err(err) => err,
            Ok(out) => panic!("unfinished code point should error, got output: {out:?}"),
        };
        assert_eq!(err, Utf8StreamParserError::IncompleteUtf8AtEof);
    }

    #[test]
    fn utf8_stream_parser_into_inner_errors_when_partial_code_point_is_buffered() {
        let mut parser = Utf8StreamParser::new(CitationStreamParser::new());
        let out = parser
            .push_bytes(&[0xC3])
            .unwrap_or_else(|err| panic!("partial code point should be buffered: {err}"));
        assert!(out.is_empty());
        let err = match parser.into_inner() {
            Err(err) => err,
            Ok(_) => panic!("buffered partial code point should be rejected"),
        };
        assert_eq!(err, Utf8StreamParserError::IncompleteUtf8AtEof);
    }

    #[test]
    fn utf8_stream_parser_into_inner_lossy_drops_buffered_partial_code_point() {
        let mut parser = Utf8StreamParser::new(CitationStreamParser::new());
        let out = parser
            .push_bytes(&[0xC3])
            .unwrap_or_else(|err| panic!("partial code point should be buffered: {err}"));
        assert!(out.is_empty());
        let mut inner = parser.into_inner_lossy();
        let tail = inner.finish();
        assert!(tail.is_empty());
    }
}