mirror of
https://github.com/openai/codex.git
synced 2026-05-01 09:56:37 +00:00
Extract command safety crate
This commit is contained in:
207
codex-rs/command-safety/src/bash_parse.rs
Normal file
207
codex-rs/command-safety/src/bash_parse.rs
Normal file
@@ -0,0 +1,207 @@
|
||||
use std::path::Path;
|
||||
|
||||
use tree_sitter::Node;
|
||||
use tree_sitter::Parser;
|
||||
use tree_sitter::Tree;
|
||||
use tree_sitter_bash::LANGUAGE as BASH;
|
||||
|
||||
/// Parse the provided bash source using tree-sitter-bash, returning a Tree on
|
||||
/// success or None if parsing failed.
|
||||
pub fn try_parse_shell(shell_lc_arg: &str) -> Option<Tree> {
|
||||
let lang = BASH.into();
|
||||
let mut parser = Parser::new();
|
||||
#[expect(clippy::expect_used)]
|
||||
parser.set_language(&lang).expect("load bash grammar");
|
||||
let old_tree: Option<&Tree> = None;
|
||||
parser.parse(shell_lc_arg, old_tree)
|
||||
}
|
||||
|
||||
/// Parse a script which may contain multiple simple commands joined only by
|
||||
/// the safe logical/pipe/sequencing operators: `&&`, `||`, `;`, `|`.
|
||||
///
|
||||
/// Returns `Some(Vec<command_words>)` if every command is a plain word‑only
|
||||
/// command and the parse tree does not contain disallowed constructs
|
||||
/// (parentheses, redirections, substitutions, control flow, etc.). Otherwise
|
||||
/// returns `None`.
|
||||
pub fn try_parse_word_only_commands_sequence(tree: &Tree, src: &str) -> Option<Vec<Vec<String>>> {
|
||||
if tree.root_node().has_error() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// List of allowed (named) node kinds for a "word only commands sequence".
|
||||
// If we encounter a named node that is not in this list we reject.
|
||||
const ALLOWED_KINDS: &[&str] = &[
|
||||
// top level containers
|
||||
"program",
|
||||
"list",
|
||||
"pipeline",
|
||||
// commands & words
|
||||
"command",
|
||||
"command_name",
|
||||
"word",
|
||||
"string",
|
||||
"string_content",
|
||||
"raw_string",
|
||||
"number",
|
||||
"concatenation",
|
||||
];
|
||||
// Allow only safe punctuation / operator tokens; anything else causes reject.
|
||||
const ALLOWED_PUNCT_TOKENS: &[&str] = &["&&", "||", ";", "|", "\"", "'"];
|
||||
|
||||
let root = tree.root_node();
|
||||
let mut cursor = root.walk();
|
||||
let mut stack = vec![root];
|
||||
let mut command_nodes = Vec::new();
|
||||
while let Some(node) = stack.pop() {
|
||||
let kind = node.kind();
|
||||
if node.is_named() {
|
||||
if !ALLOWED_KINDS.contains(&kind) {
|
||||
return None;
|
||||
}
|
||||
if kind == "command" {
|
||||
command_nodes.push(node);
|
||||
}
|
||||
} else {
|
||||
// Reject any punctuation / operator tokens that are not explicitly allowed.
|
||||
if kind.chars().any(|c| "&;|".contains(c)) && !ALLOWED_PUNCT_TOKENS.contains(&kind) {
|
||||
return None;
|
||||
}
|
||||
if !(ALLOWED_PUNCT_TOKENS.contains(&kind) || kind.trim().is_empty()) {
|
||||
// If it's a quote token or operator it's allowed above; we also allow whitespace tokens.
|
||||
// Any other punctuation like parentheses, braces, redirects, backticks, etc are rejected.
|
||||
return None;
|
||||
}
|
||||
}
|
||||
for child in node.children(&mut cursor) {
|
||||
stack.push(child);
|
||||
}
|
||||
}
|
||||
|
||||
// Walk uses a stack (LIFO), so re-sort by position to restore source order.
|
||||
command_nodes.sort_by_key(Node::start_byte);
|
||||
|
||||
let mut commands = Vec::new();
|
||||
for node in command_nodes {
|
||||
if let Some(words) = parse_plain_command_from_node(node, src) {
|
||||
commands.push(words);
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
Some(commands)
|
||||
}
|
||||
|
||||
pub fn extract_bash_command(command: &[String]) -> Option<(&str, &str)> {
|
||||
let [shell, flag, script] = command else {
|
||||
return None;
|
||||
};
|
||||
if !matches!(flag.as_str(), "-lc" | "-c") || !is_supported_posix_shell(shell) {
|
||||
return None;
|
||||
}
|
||||
Some((shell, script))
|
||||
}
|
||||
|
||||
fn is_supported_posix_shell(shell: &str) -> bool {
|
||||
Path::new(shell)
|
||||
.file_stem()
|
||||
.and_then(|stem| stem.to_str())
|
||||
.is_some_and(|name| matches!(name, "bash" | "zsh" | "sh"))
|
||||
}
|
||||
|
||||
/// Returns the sequence of plain commands within a `bash -lc "..."` or
|
||||
/// `zsh -lc "..."` invocation when the script only contains word-only commands
|
||||
/// joined by safe operators.
|
||||
pub fn parse_shell_lc_plain_commands(command: &[String]) -> Option<Vec<Vec<String>>> {
|
||||
let (_, script) = extract_bash_command(command)?;
|
||||
|
||||
let tree = try_parse_shell(script)?;
|
||||
try_parse_word_only_commands_sequence(&tree, script)
|
||||
}
|
||||
|
||||
fn parse_plain_command_from_node(cmd: tree_sitter::Node, src: &str) -> Option<Vec<String>> {
|
||||
if cmd.kind() != "command" {
|
||||
return None;
|
||||
}
|
||||
let mut words = Vec::new();
|
||||
let mut cursor = cmd.walk();
|
||||
for child in cmd.named_children(&mut cursor) {
|
||||
match child.kind() {
|
||||
"command_name" => {
|
||||
let word_node = child.named_child(0)?;
|
||||
if word_node.kind() != "word" {
|
||||
return None;
|
||||
}
|
||||
words.push(word_node.utf8_text(src.as_bytes()).ok()?.to_owned());
|
||||
}
|
||||
"word" | "number" => {
|
||||
words.push(child.utf8_text(src.as_bytes()).ok()?.to_owned());
|
||||
}
|
||||
"string" => {
|
||||
let parsed = parse_double_quoted_string(child, src)?;
|
||||
words.push(parsed);
|
||||
}
|
||||
"raw_string" => {
|
||||
let parsed = parse_raw_string(child, src)?;
|
||||
words.push(parsed);
|
||||
}
|
||||
"concatenation" => {
|
||||
// Handle concatenated arguments like -g"*.py"
|
||||
let mut concatenated = String::new();
|
||||
let mut concat_cursor = child.walk();
|
||||
for part in child.named_children(&mut concat_cursor) {
|
||||
match part.kind() {
|
||||
"word" | "number" => {
|
||||
concatenated
|
||||
.push_str(part.utf8_text(src.as_bytes()).ok()?.to_owned().as_str());
|
||||
}
|
||||
"string" => {
|
||||
let parsed = parse_double_quoted_string(part, src)?;
|
||||
concatenated.push_str(&parsed);
|
||||
}
|
||||
"raw_string" => {
|
||||
let parsed = parse_raw_string(part, src)?;
|
||||
concatenated.push_str(&parsed);
|
||||
}
|
||||
_ => return None,
|
||||
}
|
||||
}
|
||||
if concatenated.is_empty() {
|
||||
return None;
|
||||
}
|
||||
words.push(concatenated);
|
||||
}
|
||||
_ => return None,
|
||||
}
|
||||
}
|
||||
Some(words)
|
||||
}
|
||||
|
||||
fn parse_double_quoted_string(node: Node, src: &str) -> Option<String> {
|
||||
if node.kind() != "string" {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut cursor = node.walk();
|
||||
for part in node.named_children(&mut cursor) {
|
||||
if part.kind() != "string_content" {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
let raw = node.utf8_text(src.as_bytes()).ok()?;
|
||||
let stripped = raw
|
||||
.strip_prefix('"')
|
||||
.and_then(|text| text.strip_suffix('"'))?;
|
||||
Some(stripped.to_string())
|
||||
}
|
||||
|
||||
fn parse_raw_string(node: Node, src: &str) -> Option<String> {
|
||||
if node.kind() != "raw_string" {
|
||||
return None;
|
||||
}
|
||||
|
||||
let raw_string = node.utf8_text(src.as_bytes()).ok()?;
|
||||
let stripped = raw_string
|
||||
.strip_prefix('\'')
|
||||
.and_then(|s| s.strip_suffix('\''));
|
||||
stripped.map(str::to_owned)
|
||||
}
|
||||
Reference in New Issue
Block a user