diff --git a/source/air/src/def.rs b/source/air/src/def.rs index fac4314b1c..702e0d82b9 100644 --- a/source/air/src/def.rs +++ b/source/air/src/def.rs @@ -12,6 +12,7 @@ pub const CHOOSE: &str = "%%choose%%"; pub const HOLE: &str = "%%hole%%"; pub const APPLY: &str = "%%apply%%"; pub const TEMP: &str = "%%x%%"; +pub const MANGLED_SYMBOL_PREFIX: &str = "%%verus_mangled%%"; pub const SKOLEM_ID_PREFIX: &str = "skolem"; pub const ARRAY_QID: &str = "__AIR_ARRAY_QID__"; diff --git a/source/air/src/parser.rs b/source/air/src/parser.rs index 671d52c91a..34af6e9815 100644 --- a/source/air/src/parser.rs +++ b/source/air/src/parser.rs @@ -11,13 +11,9 @@ use sise::Node; use std::io::Write; use std::sync::Arc; -// Following SMT-LIB syntax specification -fn is_symbol_char(c: char) -> bool { - c.is_ascii_alphanumeric() || "~!@$%^&*_-+=<>.?/".contains(c) -} - +// AIR symbols may contain Unicode or other non-SMT characters; the printer will mangle them before sending to the solver. fn is_symbol(s: &String) -> bool { - s.len() > 0 && s.chars().all(is_symbol_char) + s.len() > 0 } fn is_bitvec(nodes: &Vec) -> Option { diff --git a/source/air/src/printer.rs b/source/air/src/printer.rs index cc88fc06ca..fa18d9393f 100644 --- a/source/air/src/printer.rs +++ b/source/air/src/printer.rs @@ -4,11 +4,78 @@ use crate::ast::{ Typs, UnaryOp, }; use crate::context::SmtSolver; -use crate::def::mk_skolem_id; +use crate::def::{MANGLED_SYMBOL_PREFIX, mk_skolem_id}; use crate::util::vec_map; use sise::{Node, Writer}; use std::sync::Arc; +fn is_smt_simple_symbol_char(c: char) -> bool { + c.is_ascii_alphanumeric() + || matches!( + c, + '~' | '!' + | '@' + | '$' + | '%' + | '^' + | '&' + | '*' + | '_' + | '-' + | '+' + | '=' + | '<' + | '>' + | '.' + | '?' + | '/' + ) +} + +fn is_valid_smt_atom(s: &str) -> bool { + if let Some(rest) = s.strip_prefix(':') { + if !rest.is_empty() && rest.chars().all(is_smt_simple_symbol_char) { + return true; + } + } + if s.bytes().all(|b| b.is_ascii_digit()) { + return true; + } + if let Some((lhs, rhs)) = s.split_once('.') { + if !lhs.is_empty() + && !rhs.is_empty() + && lhs.bytes().all(|b| b.is_ascii_digit()) + && rhs.bytes().all(|b| b.is_ascii_digit()) + { + return true; + } + } + let Some(first) = s.chars().next() else { + return false; + }; + !first.is_ascii_digit() + && is_smt_simple_symbol_char(first) + && s.chars().skip(1).all(is_smt_simple_symbol_char) +} + +fn mangle_smt_atom(s: &str) -> String { + use std::fmt::Write as _; + let mut out = String::from(MANGLED_SYMBOL_PREFIX); + if s.is_empty() { + out.push('_'); + return out; + } + for ch in s.chars() { + write!(&mut out, "_{:x}", ch as u32).expect("Writing to String should not fail"); + } + out +} + +pub fn sanitize_ident(s: &str) -> Ident { + Arc::new(if is_valid_smt_atom(s) { s.to_string() } else { mangle_smt_atom(s) }) +} + +#[inline] pub fn str_to_node(s: &str) -> Node { Node::Atom(s.to_string()) } @@ -589,7 +656,9 @@ impl NodeWriter { sise::SpacedStringWriterNodeOptions { break_line_len: if brk { 0 } else { break_len } }; match node { Node::Atom(a) => { - writer.write_atom(a, opts).unwrap(); + let a = if is_valid_smt_atom(a) { a.clone() } else { mangle_smt_atom(a) }; + assert!(sise::check_atom(&a), "invalid atom: {}", a); + writer.write_atom(&a, opts).unwrap(); } Node::List(l) => { writer.begin_list(opts).unwrap(); diff --git a/source/air/src/tests.rs b/source/air/src/tests.rs index b94cfacaff..99ee3cf5e8 100644 --- a/source/air/src/tests.rs +++ b/source/air/src/tests.rs @@ -4,10 +4,20 @@ use crate::messages::Reporter; #[allow(unused_imports)] use crate::parser::Parser; #[allow(unused_imports)] -use crate::printer::{macro_push_node, str_to_node}; +use crate::printer::{NodeWriter, macro_push_node, str_to_node}; #[allow(unused_imports)] use sise::Node; +#[test] +fn node_writer_mangles_invalid_atoms() { + for atom in ["r#for$", "r#in$", "r#bad"] { + let mut nw = NodeWriter::new(); + let s = nw.node_to_string_indent(&String::new(), &Node::Atom(atom.to_string())); + assert!(s.contains(crate::def::MANGLED_SYMBOL_PREFIX)); + assert!(!s.contains('#')); + } +} + #[allow(dead_code)] fn run_nodes_as_test(should_typecheck: bool, should_be_valid: bool, nodes: &[Node]) { let message_interface = std::sync::Arc::new(crate::messages::AirMessageInterface {}); diff --git a/source/rust_verify_test/tests/regression.rs b/source/rust_verify_test/tests/regression.rs index 5dc17781ad..965aeffb1e 100644 --- a/source/rust_verify_test/tests/regression.rs +++ b/source/rust_verify_test/tests/regression.rs @@ -1,4 +1,5 @@ #![feature(rustc_private)] +#![allow(mixed_script_confusables)] #[macro_use] mod common; use common::*; @@ -1587,3 +1588,132 @@ test_verify_one_file! { } } => Ok(()) } + +test_verify_one_file! { + #[test] raw_identifiers_in_smt_2221 verus_code! { + // Raw identifiers (r#keyword) must be sanitized before reaching SMT-LIB, + // since '#' is not a valid SMT-LIB simple-symbol character. + // Test a representative set of Rust strict keywords as function names, + // parameters, local variables, struct fields, and enum variants. + + struct r#struct { + r#type: u32, + r#loop: u32, + } + + enum r#enum { + r#match, + r#impl, + } + + fn r#match( + r#in: u32, + r#out: u32, + r#for: u32, + r#while: u32, + r#return: u32, + r#break: u32, + r#continue: u32, + r#if: u32, + r#else: u32, + r#let: u32, + r#mut: u32, + r#ref: u32, + r#move: u32, + r#const: u32, + r#static: u32, + r#unsafe: u32, + r#where: u32, + r#async: u32, + r#await: u32, + r#dyn: u32, + r#as: u32, + r#use: u32, + r#pub: u32, + r#mod: u32, + ) -> u32 + requires r#in == r#out + { + let r#type = r#in; + let r#loop = r#for; + let r#match = r#while; + let r#impl = r#return; + let r#enum = r#break; + let r#struct = r#continue; + let r#trait = r#if; + let r#let = r#else; + let r#mut = r#let; + let r#ref = r#mut; + let r#move = r#ref; + let r#const = r#move; + let r#static = r#const; + let r#unsafe = r#static; + let r#where = r#unsafe; + let r#async = r#where; + let r#await = r#async; + let r#dyn = r#await; + let r#as = r#dyn; + let r#use = r#as; + let r#pub = r#use; + let r#mod = r#pub; + assert(r#type == r#out); + r#type + } + + fn raw_param(r#for: u32) -> u32 { r#for } + } => Ok(()) +} + +test_verify_one_file! { + #[test] unicode_identifiers_in_smt_2221 verus_code! { + // Unicode identifiers must be sanitized before reaching AIR/SMT-LIB, + // since non-ASCII characters are not valid in SMT-LIB simple symbols. + // Test unicode in types, fields, enums, traits, generics, functions, + // parameters, locals, constants, and spec/proof functions. + + struct Wrapperα { + payloadγ: Tβ, + extraδ: u32, + } + + enum Choiceε { + OptionAη, + OptionBθ(Tζ), + } + + trait Traitι { + spec fn methodκ(&self) -> int; + } + + impl Traitι for Wrapperα { + spec fn methodκ(&self) -> int { self.payloadγ as int + self.extraδ as int } + } + + spec fn αβγ(x: u32) -> u32 { x } + + const MAXλ: u32 = 42; + + proof fn test_unicode(item: Wrapperα, choice: Choiceε) + requires item.payloadγ == MAXλ + { + let baseν = item.payloadγ; + let scaledξ = αβγ(1u32); + let valueο = MAXλ; + assert(baseν == MAXλ); + assert(scaledξ == 1u32); + assert(valueο == 42); + assert(item.methodκ() == MAXλ as int + item.extraδ as int); + } + } => Ok(_err) => { + // Verification succeeds; the only warnings are the mixed_script_confusables + // lint about using non-ASCII identifiers, which is a Rust lint unrelated to AIR sanitization. + assert!(_err.errors.is_empty(), "unexpected errors: {:?}", _err.errors); + for w in &_err.warnings { + let msg = &w.message; + assert!( + msg.contains("mixed script confusables") || msg.contains("warning emitted"), + "unexpected warning: {:?}", msg + ); + } + } +} diff --git a/source/vir/src/def.rs b/source/vir/src/def.rs index 2d26041d01..fc281774ad 100644 --- a/source/vir/src/def.rs +++ b/source/vir/src/def.rs @@ -681,7 +681,12 @@ pub fn encode_dt_as_path(dt: &Dt) -> Path { pub fn variant_ident(dt: &Dt, variant: &str) -> Ident { let path = encode_dt_as_path(dt); - Arc::new(format!("{}{}{}", path_to_string(&path), VARIANT_SEPARATOR, variant)) + air::printer::sanitize_ident(&format!( + "{}{}{}", + path_to_string(&path), + VARIANT_SEPARATOR, + variant + )) } pub fn is_variant_ident(datatype: &Dt, variant: &str) -> Ident { @@ -694,7 +699,7 @@ pub fn variant_field_ident_internal( field: &Ident, internal: bool, ) -> Ident { - Arc::new(format!( + air::printer::sanitize_ident(&format!( "{}{}{}{}{}", path_to_string(path), VARIANT_SEPARATOR,