From 9ef8d571ed34438b6c19b9ffea97e2ca34b7fbba Mon Sep 17 00:00:00 2001 From: mrodz Date: Mon, 19 Feb 2024 15:33:23 -0800 Subject: [PATCH] fix edge cases with character escaping --- bytecode/src/file.rs | 2 +- bytecode/src/instruction.rs | 33 ++++++++++--- bytecode/src/lib.rs | 2 +- bytecode/src/variables/ops/mul.rs | 2 +- bytecode_dev_transpiler/src/lib.rs | 3 +- compiler/src/ast.rs | 58 +++++++++++----------- compiler/src/ast/string.rs | 9 ++-- compiler/src/tests/types.rs | 78 +++++++++++++++++++++++++++--- 8 files changed, 136 insertions(+), 51 deletions(-) diff --git a/bytecode/src/file.rs b/bytecode/src/file.rs index 9e9d0f4..4bcdad0 100644 --- a/bytecode/src/file.rs +++ b/bytecode/src/file.rs @@ -256,7 +256,7 @@ impl MScriptFile { // instruction (w. Args) syntax: `{id} arg1 arg2 "multi arg" arg4\0` // where {id}: u8 [instruction, b' ', args @ .., 0x00] if in_function => { - let args = split_string(String::from_utf8_lossy(args))?; + let args = split_string(String::from_utf8_lossy(args).as_ref())?; let instruction = Instruction::new(*instruction, args); diff --git a/bytecode/src/instruction.rs b/bytecode/src/instruction.rs index 5febdd6..050c496 100644 --- a/bytecode/src/instruction.rs +++ b/bytecode/src/instruction.rs @@ -1,4 +1,4 @@ -//! All of the implementations for each bytecode instruction are found here. +//! All the implementations for each bytecode instruction are found here. use super::context::Ctx; use super::function::InstructionExitState; @@ -505,7 +505,7 @@ pub mod implementations { let raw_str = match args.len() { 0 => "", 1 => &args[0], - _ => bail!("make_str requires 0 arguments (empty string) or one argument (the string)"), + _ => bail!("make_str requires 0 arguments (empty string) or one argument (the string), but got {args:?}"), }; let var = crate::string!(raw raw_str); @@ -765,7 +765,7 @@ pub mod implementations { } ctx.pop_frame(); - return Ok(()) + return Ok(()); } _ => bail!("missing argument, and the last item in the local stack ({last:?}, S:{:#?}) is not a function.", ctx.get_local_operating_stack()), } @@ -1483,7 +1483,13 @@ pub mod implementations { /// println!("{combined:?}"); // Ok(["Hello World"]) /// /// ``` -pub fn split_string(string: Cow) -> Result> { +pub fn split_string(string: &str) -> Result> { + split_string_v2(string, true) +} + +/// Choose whether the output is one string, or if spaces/quotes +/// are delimiters for many outputs. +pub fn split_string_v2(string: &str, multi_target: bool) -> Result> { let mut result = Vec::::new(); let mut buf = String::new(); let mut in_quotes = false; @@ -1491,9 +1497,13 @@ pub fn split_string(string: Cow) -> Result> { for char in string.chars() { if !in_quotes && (char.is_whitespace()) { - if !buf.is_empty() { - result.push(buf.to_string()); - buf.clear(); + if multi_target { + if !buf.is_empty() { + result.push(buf.to_string()); + buf.clear(); + } + } else { + buf.push(char); } continue; } @@ -1514,7 +1524,7 @@ pub fn split_string(string: Cow) -> Result> { continue; } - if in_quotes { + if multi_target && in_quotes { result.push(buf.to_string()); buf.clear(); } @@ -1551,8 +1561,15 @@ pub fn split_string(string: Cow) -> Result> { bail!("found EOL while parsing string: `{string}`") } else { if !buf.is_empty() { + // flush buffer at the end of processing result.push(buf.to_string()); } + + if result.is_empty() { + // edge case + result.push(String::new()); + } + Ok(result.into_boxed_slice()) } } diff --git a/bytecode/src/lib.rs b/bytecode/src/lib.rs index abc9e47..9aeb9fe 100644 --- a/bytecode/src/lib.rs +++ b/bytecode/src/lib.rs @@ -25,7 +25,7 @@ pub mod compilation_bridge { use crate::instruction_constants::{BIN_TO_REPR, REPR_TO_BIN}; use std::borrow::Cow; - pub use crate::instruction::split_string; + pub use crate::instruction::{split_string, split_string_v2}; pub use crate::file::{MScriptFile, MScriptFileBuilder}; pub use crate::instruction::Instruction; diff --git a/bytecode/src/variables/ops/mul.rs b/bytecode/src/variables/ops/mul.rs index 8ce391e..3681d40 100644 --- a/bytecode/src/variables/ops/mul.rs +++ b/bytecode/src/variables/ops/mul.rs @@ -22,7 +22,7 @@ impl std::ops::Mul for &Primitive { } /// we cannot use built-in repeat, since Primitive does not implement Copy. - fn repeat_vec(original: &Vec, new_size: usize) -> Result> { + fn repeat_vec(original: &[Primitive], new_size: usize) -> Result> { let len = original.len(); let Some(new_size) = len.checked_mul(new_size) else { bail!("new size is too large") diff --git a/bytecode_dev_transpiler/src/lib.rs b/bytecode_dev_transpiler/src/lib.rs index 87c60ed..c295c7e 100644 --- a/bytecode_dev_transpiler/src/lib.rs +++ b/bytecode_dev_transpiler/src/lib.rs @@ -1,7 +1,6 @@ use anyhow::{bail, Result}; use bytecode::compilation_bridge::*; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; -use std::borrow::Cow; use std::fs::File; use std::io::{BufRead, BufReader, Seek, Write}; use std::path::Path; @@ -154,7 +153,7 @@ pub fn transpile_file(path: &str, new_path: &str) -> Result<()> { pb.tick(); } - let arguments = split_string(Cow::Borrowed(args))?; + let arguments = split_string(args)?; let name = name.trim_start(); diff --git a/compiler/src/ast.rs b/compiler/src/ast.rs index d318326..25ed13b 100644 --- a/compiler/src/ast.rs +++ b/compiler/src/ast.rs @@ -243,25 +243,26 @@ impl CompiledItem { } pub fn repr(&self, use_string_version: bool) -> Result { - fn fix_arg_if_needed(arg: &String) -> Result> { - let starts = arg.starts_with('\"'); - let ends = arg.ends_with('\"'); - - if starts ^ ends { - bail!("non-matching `\"` on arg {arg}") - } - - let has_quotes = starts && ends; - - if !has_quotes && arg.contains(' ') { - let mut combined = String::with_capacity(arg.len() + 2); - combined.push('"'); - combined.push_str(arg); - combined.push('"'); - Ok(Cow::Owned(combined)) - } else { - Ok(Cow::Borrowed(arg)) - } + fn fix_arg_if_needed(arg: &str) -> Result> { + Ok(Cow::Owned("\"".to_owned() + arg + "\"")) + // let starts = arg.starts_with('"'); + // let ends = arg.ends_with('"') && !arg.ends_with(r#"\""#); + // + // if starts ^ ends { + // bail!("non-matching `\"` on arg {arg} ({starts} & {ends})") + // } + // + // let has_quotes = starts && ends; + // + // if !has_quotes && arg.contains(' ') { + // let mut combined = String::with_capacity(arg.len() + 2); + // combined.push('"'); + // combined.push_str(arg); + // combined.push('"'); + // Ok(Cow::Owned(combined)) + // } else { + // Ok(Cow::Borrowed(arg)) + // } } match self { @@ -291,8 +292,9 @@ impl CompiledItem { if arguments.len() >= 1 { for arg in &arguments[..] { args.push(' '); - let arg = fix_arg_if_needed(arg).unwrap(); - args.push_str(arg.as_str()); + let replaced = arg.replace('"', "\\\""); + let arg = fix_arg_if_needed(&replaced)?; + args.push_str(arg.as_ref()); } } @@ -384,7 +386,7 @@ impl CompilationState { &self, file_manager: &FileManager, driver: &mut impl FnMut(&File, Vec) -> Result<()>, - ) -> Result<(), Vec> { + ) -> Result<(), Vec> { { let completed = file_manager.completed_ast.borrow(); log::debug!( @@ -401,7 +403,7 @@ impl CompilationState { file_manager: &FileManager, driver: &mut impl FnMut(&File, Vec) -> Result<()>, depth: usize, - ) -> Result<(), Vec> { + ) -> Result<(), Vec> { let mut view = self.compilation_queue.borrow_mut(); let mut node @ Some(..) = view.take() else { @@ -452,7 +454,7 @@ impl CompilationState { pub fn free_loop_register(&self, register: NumberLoopRegister) { if let NumberLoopRegister::Generated(id) = register { - assert!(self.loop_register_c.get() == id); + assert_eq!(self.loop_register_c.get(), id); self.loop_register_c.set(id - 1); } @@ -503,7 +505,7 @@ impl CompilationState { } } -pub(crate) trait Compile { +pub(crate) trait Compile { fn compile(&self, state: &CompilationState) -> Result, E>; } @@ -580,10 +582,10 @@ impl<'a> From<&'a Ident> for Dependency<'a> { /// /// The algorithm for "satisfying" dependencies is as follows: /// * If dependency.name != supplied.name, continue -/// * If dependency is a variable from a child scope, succeed if supplied.type == dependency.type +/// * If dependency is a variable from a child scope, succeed if `supplied.type == dependency.type` /// * If dependency is not a variable from a child scope, it will be propagated regardless of whether the types /// match (required for cases where a variable from a parent scope is copied to a separate local variable with the same name). -/// * Succeed if supplied.type == dependency.type +/// * Succeed if `supplied.type == dependency.type` pub(crate) fn get_net_dependencies(ast_item: &dyn Dependencies, is_scope: bool) -> Vec { let supplies = ast_item.supplies(); let dependencies = ast_item.dependencies(); @@ -637,7 +639,7 @@ pub(crate) trait Dependencies { /// This function is used to calculate the outstanding dependencies by filtering out identities /// _needed_ from the identities _supplied_ by an AST node. If implementing an AST node that - /// **DIRECTLY** creates a scope (For example, a block of code), this method should be overriden. + /// **DIRECTLY** creates a scope (For example, a block of code), this method should be overridden. /// /// [`get_net_dependencies(self, false)`](get_net_dependencies) is the default implementation. /// Override this function and pass `true` if dealing with an AST node that creates a scope. diff --git a/compiler/src/ast/string.rs b/compiler/src/ast/string.rs index ab61d3b..a503bcc 100644 --- a/compiler/src/ast/string.rs +++ b/compiler/src/ast/string.rs @@ -1,6 +1,5 @@ -use std::borrow::Cow; - use anyhow::Result; +use bytecode::compilation_bridge::split_string_v2; use crate::{ instruction, @@ -29,7 +28,7 @@ impl Dependencies for AstString { } impl IntoType for AstString { - fn for_type(&self) -> Result { + fn for_type(&self) -> Result { match self { AstString::Plain(str) => Ok(TypeLayout::Native(NativeType::Str(StrWrapper(Some( str.len(), @@ -51,7 +50,9 @@ impl Compile for AstString { impl Parser { pub fn string(input: Node) -> Result { let as_str = input.as_str(); - let encoded = bytecode::compilation_bridge::split_string(Cow::Borrowed(as_str))?[0].clone(); + let bridge_parts = split_string_v2(&("\"".to_owned() + as_str + "\""), false)?; + + let encoded = bridge_parts[0].clone(); Ok(AstString::Plain(encoded)) } diff --git a/compiler/src/tests/types.rs b/compiler/src/tests/types.rs index a43a89e..1b1eb5d 100644 --- a/compiler/src/tests/types.rs +++ b/compiler/src/tests/types.rs @@ -207,7 +207,7 @@ fn types_with_reassignment() { } #[test] -#[ignore = "2/4/2024 - type updates are no longer in spec"] +#[should_panic = "type mismatch: this assignment will update a variable with type `int`, which is not compatible with the original type `str`"] fn type_updates() { eval( r#" @@ -224,9 +224,8 @@ fn type_updates() { .unwrap() } -/// `2` is an invalid index into "hi", so this test asserts that types and values are being updated #[test] -#[ignore = "2/4/2024 - type updates are no longer in spec"] +#[should_panic = "type mismatch: this assignment will update a variable with type `[int...]`, which is not compatible with the original type `str`"] fn type_updates_index() { eval( r#" @@ -240,12 +239,12 @@ fn type_updates_index() { } #[test] -#[should_panic = "cannot index with `3` into list whose known valid safe indexes are 0..2"] +#[should_panic = "type mismatch: this assignment will update a variable with type `[int, int, int]`, which is not compatible with the original type `str`"] fn invalid_index() { eval( r#" x = "hi" - x = [1, 2, 3] + const x = [1, 2, 3] x[3] "#, @@ -470,5 +469,72 @@ fn list_with_optional_class() { assert cats_str == ["Cat named Muna is a tabby: Cat", "Cat named Scout is a tabby: Cat", "Cat named Odie is a tabby: Cat", "Cat named Air Bud is a tabby: Cat", "Cat named Old Yeller is a tabby: Cat"] "#, ) - .unwrap(); + .unwrap(); +} + +#[test] +fn escape_sequence_strings() { + eval( + r#" + const MESSAGE = "Santa said, \"Ho Ho Ho!\" (North Pole Chronicle 52).\n\t--12/25/0002" + + assert MESSAGE[12] == "\"" + + const QUOT_ORD: byte = 34.to_byte() + const LF_ORD: byte = 10.to_byte() + const TAB_ORD: byte = 9.to_byte() + + assert MESSAGE[12] == QUOT_ORD.to_ascii() + + assert MESSAGE.index_of("\n") == 50 + assert MESSAGE[50] == LF_ORD.to_ascii() + + assert MESSAGE[51] == TAB_ORD.to_ascii() + "#, + ) + .unwrap() +} + +/// https://github.com/mrodz/mscript/issues/185 +#[test] +fn cross_file_escape_behavior() { + EvalEnvironment::entrypoint( + "main.ms", + r#" + import other + + const MESSAGE = "hello \"world\"" + + print MESSAGE == other.MESSAGE + print MESSAGE + print other.MESSAGE + + me = other.Person("Billy Bob") + + print me.to_str() + assert me.to_str() == "Person {\n\tname: \"Billy Bob\"\n}" + "#, + ) + .unwrap() + .add( + "other.ms", + r#" + export const MESSAGE: str = "hello \"world\"" + + export class Person { + name: str + + constructor(self, name: str) { + self.name = name + } + + fn to_str(self) -> str { + return "Person {\n\tname: \"" + self.name + "\"\n}" + } + } + "#, + ) + .unwrap() + .run() + .unwrap() }