mirror of
https://github.com/openai/codex.git
synced 2026-04-24 22:54:54 +00:00
18 KiB
18 KiB
PR #1467: Fix Unicode handling in chat_composer "@" token detection
- URL: https://github.com/openai/codex/pull/1467
- Author: ryozi-tn
- Created: 2025-07-06 18:00:10 UTC
- Updated: 2025-07-07 20:43:39 UTC
- Changes: +175/-18, Files changed: 1, Commits: 4
Description
Issues Fixed
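- Primary Issue (#1450): Unicode cursor positioning was incorrect because the cursor column, which is a character index, was used directly as a byte offset when slicing the UTF-8 line. On lines containing multi-byte characters this can point at the wrong position or inside a multi-byte character.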
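- Additional Issue: Full-width spaces (CJK whitespace such as "　", U+3000 IDEOGRAPHIC SPACE) weren't properly handled as token boundaries: the old code advanced exactly one byte past a whitespace match, which is only correct for single-byte whitespace.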
Full Diff
diff --git a/codex-rs/tui/src/bottom_pane/chat_composer.rs b/codex-rs/tui/src/bottom_pane/chat_composer.rs
index 59d6e4579d..cd8e9fa17f 100644
--- a/codex-rs/tui/src/bottom_pane/chat_composer.rs
+++ b/codex-rs/tui/src/bottom_pane/chat_composer.rs
@@ -290,26 +290,28 @@ impl ChatComposer<'_> {
// Guard against out-of-bounds rows.
let line = textarea.lines().get(row)?.as_str();
- // Clamp the cursor column to the line length to avoid slicing panics
- // when the cursor is at the end of the line.
- let col = col.min(line.len());
+ // Calculate byte offset for cursor position
+ let cursor_byte_offset = line.chars().take(col).map(|c| c.len_utf8()).sum::<usize>();
// Split the line at the cursor position so we can search for word
// boundaries on both sides.
- let before_cursor = &line[..col];
- let after_cursor = &line[col..];
+ let before_cursor = &line[..cursor_byte_offset];
+ let after_cursor = &line[cursor_byte_offset..];
- // Find start index (first character **after** the previous whitespace).
+ // Find start index (first character **after** the previous whitespace, which may be multi-byte).
let start_idx = before_cursor
- .rfind(|c: char| c.is_whitespace())
- .map(|idx| idx + 1)
+ .char_indices()
+ .rfind(|(_, c)| c.is_whitespace())
+ .map(|(idx, c)| idx + c.len_utf8())
.unwrap_or(0);
- // Find end index (first whitespace **after** the cursor position).
+ // Find end index (first whitespace, possibly multi-byte, **after** the cursor position).
let end_rel_idx = after_cursor
- .find(|c: char| c.is_whitespace())
+ .char_indices()
+ .find(|(_, c)| c.is_whitespace())
+ .map(|(idx, _)| idx)
.unwrap_or(after_cursor.len());
- let end_idx = col + end_rel_idx;
+ let end_idx = cursor_byte_offset + end_rel_idx;
if start_idx >= end_idx {
return None;
@@ -336,21 +338,25 @@ impl ChatComposer<'_> {
let mut lines: Vec<String> = self.textarea.lines().to_vec();
if let Some(line) = lines.get_mut(row) {
- let col = col.min(line.len());
+ // Calculate byte offset for cursor position
+ let cursor_byte_offset = line.chars().take(col).map(|c| c.len_utf8()).sum::<usize>();
- let before_cursor = &line[..col];
- let after_cursor = &line[col..];
+ let before_cursor = &line[..cursor_byte_offset];
+ let after_cursor = &line[cursor_byte_offset..];
// Determine token boundaries.
let start_idx = before_cursor
- .rfind(|c: char| c.is_whitespace())
- .map(|idx| idx + 1)
+ .char_indices()
+ .rfind(|(_, c)| c.is_whitespace())
+ .map(|(idx, c)| idx + c.len_utf8())
.unwrap_or(0);
let end_rel_idx = after_cursor
- .find(|c: char| c.is_whitespace())
+ .char_indices()
+ .find(|(_, c)| c.is_whitespace())
+ .map(|(idx, _)| idx)
.unwrap_or(after_cursor.len());
- let end_idx = col + end_rel_idx;
+ let end_idx = cursor_byte_offset + end_rel_idx;
// Replace the slice `[start_idx, end_idx)` with the chosen path and a trailing space.
let mut new_line =
@@ -618,3 +624,154 @@ impl WidgetRef for &ChatComposer<'_> {
}
}
}
+
+#[cfg(test)]
+mod tests {
+ use crate::bottom_pane::ChatComposer;
+ use tui_textarea::TextArea;
+
+ #[test]
+ fn test_current_at_token_basic_cases() {
+ let test_cases = vec![
+ // Valid @ tokens
+ ("@hello", 3, Some("hello".to_string()), "Basic ASCII token"),
+ (
+ "@file.txt",
+ 4,
+ Some("file.txt".to_string()),
+ "ASCII with extension",
+ ),
+ (
+ "hello @world test",
+ 8,
+ Some("world".to_string()),
+ "ASCII token in middle",
+ ),
+ (
+ "@test123",
+ 5,
+ Some("test123".to_string()),
+ "ASCII with numbers",
+ ),
+ // Unicode examples
+ ("@İstanbul", 3, Some("İstanbul".to_string()), "Turkish text"),
+ (
+ "@testЙЦУ.rs",
+ 8,
+ Some("testЙЦУ.rs".to_string()),
+ "Mixed ASCII and Cyrillic",
+ ),
+ ("@诶", 2, Some("诶".to_string()), "Chinese character"),
+ ("@👍", 2, Some("👍".to_string()), "Emoji token"),
+ // Invalid cases (should return None)
+ ("hello", 2, None, "No @ symbol"),
+ ("@", 1, None, "Only @ symbol"),
+ ("@ hello", 2, None, "@ followed by space"),
+ ("test @ world", 6, None, "@ with spaces around"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for case: {} - input: '{}', cursor: {}",
+ description, input, cursor_pos
+ );
+ }
+ }
+
+ #[test]
+ fn test_current_at_token_cursor_positions() {
+ let test_cases = vec![
+ // Different cursor positions within a token
+ ("@test", 0, Some("test".to_string()), "Cursor at @"),
+ ("@test", 1, Some("test".to_string()), "Cursor after @"),
+ ("@test", 5, Some("test".to_string()), "Cursor at end"),
+ // Multiple tokens - cursor determines which token
+ ("@file1 @file2", 0, Some("file1".to_string()), "First token"),
+ (
+ "@file1 @file2",
+ 8,
+ Some("file2".to_string()),
+ "Second token",
+ ),
+ // Edge cases
+ ("@", 0, None, "Only @ symbol"),
+ ("@a", 2, Some("a".to_string()), "Single character after @"),
+ ("", 0, None, "Empty input"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for cursor position case: {description} - input: '{input}', cursor: {cursor_pos}",
+ );
+ }
+ }
+
+ #[test]
+ fn test_current_at_token_whitespace_boundaries() {
+ let test_cases = vec![
+ // Space boundaries
+ (
+ "aaa@aaa",
+ 4,
+ None,
+ "Connected @ token - no completion by design",
+ ),
+ (
+ "aaa @aaa",
+ 5,
+ Some("aaa".to_string()),
+ "@ token after space",
+ ),
+ (
+ "test @file.txt",
+ 7,
+ Some("file.txt".to_string()),
+ "@ token after space",
+ ),
+ // Full-width space boundaries
+ (
+ "test @İstanbul",
+ 6,
+ Some("İstanbul".to_string()),
+ "@ token after full-width space",
+ ),
+ (
+ "@ЙЦУ @诶",
+ 6,
+ Some("诶".to_string()),
+ "Full-width space between Unicode tokens",
+ ),
+ // Tab boundaries
+ (
+ "test\t@file",
+ 6,
+ Some("file".to_string()),
+ "@ token after tab",
+ ),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for whitespace boundary case: {description} - input: '{input}', cursor: {cursor_pos}",
+ );
+ }
+ }
+}
Review Comments
codex-rs/tui/src/bottom_pane/chat_composer.rs
- Created: 2025-07-07 20:31:18 UTC | Link: https://github.com/openai/codex/pull/1467#discussion_r2190962611
@@ -618,3 +624,107 @@ impl WidgetRef for &ChatComposer<'_> {
}
}
}
+
+#[cfg(test)]
+mod tests {
+ use crate::bottom_pane::ChatComposer;
+ use tui_textarea::TextArea;
+
+ #[test]
+ fn test_current_at_token_basic_cases() {
+ let test_cases = vec![
+ // Valid @ tokens
+ ("@hello", 3, Some("hello".to_string()), "Basic ASCII token"),
+ ("@file.txt", 4, Some("file.txt".to_string()), "ASCII with extension"),
+ ("hello @world test", 8, Some("world".to_string()), "ASCII token in middle"),
+ ("@test123", 5, Some("test123".to_string()), "ASCII with numbers"),
+
+ // Unicode examples
+ ("@İstanbul", 3, Some("İstanbul".to_string()), "Turkish text"),
+ ("@testЙЦУ.rs", 8, Some("testЙЦУ.rs".to_string()), "Mixed ASCII and Cyrillic"),
+ ("@诶", 2, Some("诶".to_string()), "Chinese character"),
+ ("@👍", 2, Some("👍".to_string()), "Emoji token"),
+
+ // Invalid cases (should return None)
+ ("hello", 2, None, "No @ symbol"),
+ ("@", 1, None, "Only @ symbol"),
+ ("@ hello", 2, None, "@ followed by space"),
+ ("test @ world", 6, None, "@ with spaces around"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for case: {} - input: '{}', cursor: {}",
+ description, input, cursor_pos
+ );
+ }
+ }
+
+ #[test]
+ fn test_current_at_token_cursor_positions() {
+ let test_cases = vec![
+ // Different cursor positions within a token
+ ("@test", 0, Some("test".to_string()), "Cursor at @"),
+ ("@test", 1, Some("test".to_string()), "Cursor after @"),
+ ("@test", 5, Some("test".to_string()), "Cursor at end"),
+
+ // Multiple tokens - cursor determines which token
+ ("@file1 @file2", 0, Some("file1".to_string()), "First token"),
+ ("@file1 @file2", 8, Some("file2".to_string()), "Second token"),
+
+ // Edge cases
+ ("@", 0, None, "Only @ symbol"),
+ ("@a", 2, Some("a".to_string()), "Single character after @"),
+ ("", 0, None, "Empty input"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for cursor position case: {} - input: '{}', cursor: {}",
+ description, input, cursor_pos
+ );
+ }
+ }
+
+ #[test]
+ fn test_current_at_token_whitespace_boundaries() {
+ let test_cases = vec![
+ // Space boundaries
+ ("aaa@aaa", 4, None, "Connected @ token - no completion by design"),
+ ("aaa @aaa", 5, Some("aaa".to_string()), "@ token after space"),
+ ("test @file.txt", 7, Some("file.txt".to_string()), "@ token after space"),
+
+ // Full-width space boundaries
+ ("test @İstanbul", 6, Some("İstanbul".to_string()), "@ token after full-width space"),
+ ("@ЙЦУ @诶", 6, Some("诶".to_string()), "Full-width space between Unicode tokens"),
+
+ // Tab and newline boundaries
+ ("test\t@file", 6, Some("file".to_string()), "@ token after tab"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for whitespace boundary case: {} - input: '{}', cursor: {}",
+ description, input, cursor_pos
+ );
Suggested replacement (inline format arguments):
```rust
assert_eq!(
    result, expected,
    "Failed for whitespace boundary case: {description} - input: '{input}', cursor: {cursor_pos}",
);
```
- Created: 2025-07-07 20:31:43 UTC | Link: https://github.com/openai/codex/pull/1467#discussion_r2190963121
@@ -618,3 +624,107 @@ impl WidgetRef for &ChatComposer<'_> {
}
}
}
+
+#[cfg(test)]
+mod tests {
+ use crate::bottom_pane::ChatComposer;
+ use tui_textarea::TextArea;
+
+ #[test]
+ fn test_current_at_token_basic_cases() {
+ let test_cases = vec![
+ // Valid @ tokens
+ ("@hello", 3, Some("hello".to_string()), "Basic ASCII token"),
+ ("@file.txt", 4, Some("file.txt".to_string()), "ASCII with extension"),
+ ("hello @world test", 8, Some("world".to_string()), "ASCII token in middle"),
+ ("@test123", 5, Some("test123".to_string()), "ASCII with numbers"),
+
+ // Unicode examples
+ ("@İstanbul", 3, Some("İstanbul".to_string()), "Turkish text"),
+ ("@testЙЦУ.rs", 8, Some("testЙЦУ.rs".to_string()), "Mixed ASCII and Cyrillic"),
+ ("@诶", 2, Some("诶".to_string()), "Chinese character"),
+ ("@👍", 2, Some("👍".to_string()), "Emoji token"),
+
+ // Invalid cases (should return None)
+ ("hello", 2, None, "No @ symbol"),
+ ("@", 1, None, "Only @ symbol"),
+ ("@ hello", 2, None, "@ followed by space"),
+ ("test @ world", 6, None, "@ with spaces around"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for case: {} - input: '{}', cursor: {}",
+ description, input, cursor_pos
+ );
+ }
+ }
+
+ #[test]
+ fn test_current_at_token_cursor_positions() {
+ let test_cases = vec![
+ // Different cursor positions within a token
+ ("@test", 0, Some("test".to_string()), "Cursor at @"),
+ ("@test", 1, Some("test".to_string()), "Cursor after @"),
+ ("@test", 5, Some("test".to_string()), "Cursor at end"),
+
+ // Multiple tokens - cursor determines which token
+ ("@file1 @file2", 0, Some("file1".to_string()), "First token"),
+ ("@file1 @file2", 8, Some("file2".to_string()), "Second token"),
+
+ // Edge cases
+ ("@", 0, None, "Only @ symbol"),
+ ("@a", 2, Some("a".to_string()), "Single character after @"),
+ ("", 0, None, "Empty input"),
+ ];
+
+ for (input, cursor_pos, expected, description) in test_cases {
+ let mut textarea = TextArea::default();
+ textarea.insert_str(input);
+ textarea.move_cursor(tui_textarea::CursorMove::Jump(0, cursor_pos));
+
+ let result = ChatComposer::current_at_token(&textarea);
+ assert_eq!(
+ result, expected,
+ "Failed for cursor position case: {} - input: '{}', cursor: {}",
+ description, input, cursor_pos
+ );
Suggested replacement (inline format arguments):
```rust
assert_eq!(
    result, expected,
    "Failed for cursor position case: {description} - input: '{input}', cursor: {cursor_pos}",
);
```