Skip to content

Commit

Permalink
prevent some panics when parsing invalid Unicode on Windows
Browse files Browse the repository at this point in the history
Previously, OsStr::starts_with() would panic on Windows if the OsStr
contained any invalid Unicode. Because this method is in the critical
path for parsing command line arguments, Clap up to v2.33.0 on Windows
panics on non-Unicode filenames, even when the application specifically
requests OsString to account for this case. As a workaround, this commit
adds a custom implementation of OsStr::starts_with() for Windows that
doesn't rely on OsStr::as_bytes().

With this change, examples like the following can parse successfully.
Here "X" represents invalid Unicode:

    clap.exe X
    clap.exe --some-arg X

However, examples like these will still panic:

    clap.exe --some-arg=X
    clap.exe -sX

These still panic, because they require string splitting operations like
OsStr::split_at_byte() and OsStr::trim_left_matches(). Fixing these
would require either breaking open the in-memory representation of
OsStr, which is not stable, or changing all these APIs to allow for an
allocated OsString/Cow<OsStr>. This fix is aiming to be minimal and also
not wildly unsafe, so we leave these bugs in place. Hopefully the
majority of invalid Unicode in the wild occurs in filepaths given as
standalone arguments, and many of the rest can be converted from flags
with `=` to flags with a space.

Fixes clap-rs#1905.
  • Loading branch information
oconnor663 committed May 5, 2020
1 parent 0129fe5 commit 703caf3
Show file tree
Hide file tree
Showing 2 changed files with 204 additions and 0 deletions.
81 changes: 81 additions & 0 deletions src/osstringext.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,73 @@ pub trait OsStrExt2 {
fn split(&self, b: u8) -> OsSplit;
}

// A starts-with implementation that does not panic when the OsStr contains
// invalid Unicode.
//
// A Windows OsStr is usually UTF-16. If `prefix` is valid UTF-8, we can
// re-encode it as UTF-16, and ask whether `osstr` starts with the same series
// of u16 code units. If `prefix` is not valid UTF-8, then this comparison
// isn't meaningful, and we just return false.
#[cfg(target_os = "windows")]
fn windows_osstr_starts_with(osstr: &OsStr, prefix: &[u8]) -> bool {
use std::os::windows::ffi::OsStrExt;
let prefix_str = if let Ok(s) = std::str::from_utf8(prefix) {
s
} else {
return false;
};
let mut osstr_units = osstr.encode_wide();
let mut prefix_units = prefix_str.encode_utf16();
loop {
match (osstr_units.next(), prefix_units.next()) {
// These code units match. Keep looping.
(Some(o), Some(p)) if o == p => continue,
// We've reached the end of the prefix. It's a match.
(_, None) => return true,
// Otherwise, it's not a match.
_ => return false,
}
}
}

#[test]
#[cfg(target_os = "windows")]
fn test_windows_osstr_starts_with() {
use std::ffi::OsString;
use std::os::windows::ffi::OsStringExt;

fn from_ascii(ascii: &[u8]) -> OsString {
let u16_vec: Vec<u16> = ascii.iter().map(|&c| c as u16).collect();
OsString::from_wide(&u16_vec)
}

// Test all the basic cases.
assert!(windows_osstr_starts_with(&from_ascii(b"abcdef"), b"abc"));
assert!(windows_osstr_starts_with(&from_ascii(b"abcdef"), b"abcdef"));
assert!(!windows_osstr_starts_with(&from_ascii(b"abcdef"), b"def"));
assert!(!windows_osstr_starts_with(&from_ascii(b"abc"), b"abcd"));

// Test the case where the candidate prefix is not valid UTF-8. Note that a
// standalone \xff byte is valid ASCII but not valid UTF-8. Thus although
// these strings look identical, they do not match.
assert!(!windows_osstr_starts_with(&from_ascii(b"\xff"), b"\xff"));

// Test the case where the OsString is not valid UTF-16. It should still be
// possible to match the valid characters at the front.
//
// UTF-16 surrogate characters are only valid in pairs. Including one on
// the end by itself makes this invalid UTF-16.
let surrogate_char: u16 = 0xDC00;
let mut invalid_unicode =
OsString::from_wide(&['a' as u16, 'b' as u16, 'c' as u16, surrogate_char]);
assert!(
invalid_unicode.to_str().is_none(),
"This string is invalid Unicode, and conversion to &str should fail.",
);
assert!(windows_osstr_starts_with(&invalid_unicode, b"abc"));
assert!(!windows_osstr_starts_with(&invalid_unicode, b"abcd"));
}

#[cfg(any(target_os = "windows", target_arch = "wasm32"))]
impl OsStrExt3 for OsStr {
fn from_bytes(b: &[u8]) -> &Self {
Expand All @@ -33,6 +100,20 @@ impl OsStrExt3 for OsStr {

impl OsStrExt2 for OsStr {
fn starts_with(&self, s: &[u8]) -> bool {
#[cfg(target_os = "windows")]
{
// On Windows, the as_bytes() method will panic if the OsStr
// contains invalid Unicode. To avoid this, we use a
// Windows-specific starts-with function that doesn't rely on
// as_bytes(). This is necessary for Windows command line
// applications to handle non-Unicode arguments successfully. This
// allows common cases like `clap.exe [invalid]` to succeed, though
// cases that require string splitting will still fail, like
// `clap.exe --arg=[invalid]`. Note that this entire module is
// replaced in Clap 3.x, so this workaround is specific to the 2.x
// branch.
return windows_osstr_starts_with(self, s);
}
self.as_bytes().starts_with(s)
}

Expand Down
123 changes: 123 additions & 0 deletions tests/utf16.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
//! These Windows-only tests are ported from the non-Windows tests in
//! tests/utf8.rs. Two categories of tests are omitted, however:
//!
//! 1. We don't include the tests that use StrictUtf8 mode, since that's
//! eplicitly Unix-only.
//! 2. We don't include tests that require splitting invalid Unicode strings.
//!
//! Elaborating on that second point: We have fixed invalid Unicode handling in
//! the OsStr::starts_with() method. That means that examples like these can
//! should parse successfully ("X" represents invalid Unicode):
//!
//! clap.exe X
//! clap.exe --some-arg X
//!
//! However, other string handling methods like OsStr::split_at_byte() and
//! OsStr::trim_left_matches() still panic in the face of invalid Unicode. That
//! means examples like these *don't* work:
//!
//! clap.exe --some-arg=X
//! clap.exe -sX
//!
//! Thus this file omits tests for those cases. All of this OsStr handling is
//! being rewritten for the 3.0 release in any case, so this limitation is
//! specific to the 2.x series.
#![cfg(windows)]

extern crate clap;

use clap::{App, Arg};
use std::ffi::OsString;
use std::os::windows::ffi::OsStringExt;

fn bad_utf16_string() -> OsString {
// UTF-16 surrogate characters are only valid in pairs. Including one on
// the end by itself makes this invalid UTF-16.
let surrogate_char: u16 = 0xDC00;
let os = OsString::from_wide(&[surrogate_char]);
assert!(os.to_str().is_none(), "should be invalid Unicode");
os
}

#[test]
fn invalid_utf16_lossy_positional() {
let r = App::new("bad_utf16")
.arg(Arg::from_usage("<arg> 'some arg'"))
.get_matches_from_safe(vec![OsString::from(""), bad_utf16_string()]);
assert!(r.is_ok());
let m = r.unwrap();
assert!(m.is_present("arg"));
assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
}

#[test]
fn invalid_utf16_lossy_option_short_space() {
let r = App::new("bad_utf16")
.arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
.get_matches_from_safe(vec![
OsString::from(""),
OsString::from("-a"),
bad_utf16_string(),
]);
assert!(r.is_ok());
let m = r.unwrap();
assert!(m.is_present("arg"));
assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
}

#[test]
fn invalid_utf16_lossy_option_long_space() {
let r = App::new("bad_utf16")
.arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
.get_matches_from_safe(vec![
OsString::from(""),
OsString::from("--arg"),
bad_utf16_string(),
]);
assert!(r.is_ok());
let m = r.unwrap();
assert!(m.is_present("arg"));
assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
}

#[test]
fn invalid_utf16_positional() {
let r = App::new("bad_utf16")
.arg(Arg::from_usage("<arg> 'some arg'"))
.get_matches_from_safe(vec![OsString::from(""), bad_utf16_string()]);
assert!(r.is_ok());
let m = r.unwrap();
assert!(m.is_present("arg"));
assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
}

#[test]
fn invalid_utf16_option_short_space() {
let r = App::new("bad_utf16")
.arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
.get_matches_from_safe(vec![
OsString::from(""),
OsString::from("-a"),
bad_utf16_string(),
]);
assert!(r.is_ok());
let m = r.unwrap();
assert!(m.is_present("arg"));
assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
}

#[test]
fn invalid_utf16_option_long_space() {
let r = App::new("bad_utf16")
.arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
.get_matches_from_safe(vec![
OsString::from(""),
OsString::from("--arg"),
bad_utf16_string(),
]);
assert!(r.is_ok());
let m = r.unwrap();
assert!(m.is_present("arg"));
assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
}

0 comments on commit 703caf3

Please sign in to comment.