Merge pull request #1907 from oconnor663/windows_2.33

clap-rs · May 11, 2020 · 922c645 · 922c645
2 parents 0129fe5 + 703caf3
commit 922c645
Show file tree

Hide file tree

Showing 2 changed files with 204 additions and 0 deletions.
diff --git a/src/osstringext.rs b/src/osstringext.rs
@@ -20,6 +20,73 @@ pub trait OsStrExt2 {
     fn split(&self, b: u8) -> OsSplit;
 }
 
+// A starts-with implementation that does not panic when the OsStr contains
+// invalid Unicode.
+//
+// A Windows OsStr is usually UTF-16. If `prefix` is valid UTF-8, we can
+// re-encode it as UTF-16, and ask whether `osstr` starts with the same series
+// of u16 code units. If `prefix` is not valid UTF-8, then this comparison
+// isn't meaningful, and we just return false.
+#[cfg(target_os = "windows")]
+fn windows_osstr_starts_with(osstr: &OsStr, prefix: &[u8]) -> bool {
+    use std::os::windows::ffi::OsStrExt;
+    let prefix_str = if let Ok(s) = std::str::from_utf8(prefix) {
+        s
+    } else {
+        return false;
+    };
+    let mut osstr_units = osstr.encode_wide();
+    let mut prefix_units = prefix_str.encode_utf16();
+    loop {
+        match (osstr_units.next(), prefix_units.next()) {
+            // These code units match. Keep looping.
+            (Some(o), Some(p)) if o == p => continue,
+            // We've reached the end of the prefix. It's a match.
+            (_, None) => return true,
+            // Otherwise, it's not a match.
+            _ => return false,
+        }
+    }
+}
+
+#[test]
+#[cfg(target_os = "windows")]
+fn test_windows_osstr_starts_with() {
+    use std::ffi::OsString;
+    use std::os::windows::ffi::OsStringExt;
+
+    fn from_ascii(ascii: &[u8]) -> OsString {
+        let u16_vec: Vec<u16> = ascii.iter().map(|&c| c as u16).collect();
+        OsString::from_wide(&u16_vec)
+    }
+
+    // Test all the basic cases.
+    assert!(windows_osstr_starts_with(&from_ascii(b"abcdef"), b"abc"));
+    assert!(windows_osstr_starts_with(&from_ascii(b"abcdef"), b"abcdef"));
+    assert!(!windows_osstr_starts_with(&from_ascii(b"abcdef"), b"def"));
+    assert!(!windows_osstr_starts_with(&from_ascii(b"abc"), b"abcd"));
+
+    // Test the case where the candidate prefix is not valid UTF-8. Note that a
+    // standalone \xff byte is valid ASCII but not valid UTF-8. Thus although
+    // these strings look identical, they do not match.
+    assert!(!windows_osstr_starts_with(&from_ascii(b"\xff"), b"\xff"));
+
+    // Test the case where the OsString is not valid UTF-16. It should still be
+    // possible to match the valid characters at the front.
+    //
+    // UTF-16 surrogate characters are only valid in pairs. Including one on
+    // the end by itself makes this invalid UTF-16.
+    let surrogate_char: u16 = 0xDC00;
+    let mut invalid_unicode =
+        OsString::from_wide(&['a' as u16, 'b' as u16, 'c' as u16, surrogate_char]);
+    assert!(
+        invalid_unicode.to_str().is_none(),
+        "This string is invalid Unicode, and conversion to &str should fail.",
+    );
+    assert!(windows_osstr_starts_with(&invalid_unicode, b"abc"));
+    assert!(!windows_osstr_starts_with(&invalid_unicode, b"abcd"));
+}
+
 #[cfg(any(target_os = "windows", target_arch = "wasm32"))]
 impl OsStrExt3 for OsStr {
     fn from_bytes(b: &[u8]) -> &Self {
@@ -33,6 +100,20 @@ impl OsStrExt3 for OsStr {
 
 impl OsStrExt2 for OsStr {
     fn starts_with(&self, s: &[u8]) -> bool {
+        #[cfg(target_os = "windows")]
+        {
+            // On Windows, the as_bytes() method will panic if the OsStr
+            // contains invalid Unicode. To avoid this, we use a
+            // Windows-specific starts-with function that doesn't rely on
+            // as_bytes(). This is necessary for Windows command line
+            // applications to handle non-Unicode arguments successfully. This
+            // allows common cases like `clap.exe [invalid]` to succeed, though
+            // cases that require string splitting will still fail, like
+            // `clap.exe --arg=[invalid]`. Note that this entire module is
+            // replaced in Clap 3.x, so this workaround is specific to the 2.x
+            // branch.
+            return windows_osstr_starts_with(self, s);
+        }
         self.as_bytes().starts_with(s)
     }
 

diff --git a/tests/utf16.rs b/tests/utf16.rs
@@ -0,0 +1,123 @@
+//! These Windows-only tests are ported from the non-Windows tests in
+//! tests/utf8.rs. Two categories of tests are omitted, however:
+//!
+//! 1. We don't include the tests that use StrictUtf8 mode, since that's
+//!    eplicitly Unix-only.
+//! 2. We don't include tests that require splitting invalid Unicode strings.
+//!
+//! Elaborating on that second point: We have fixed invalid Unicode handling in
+//! the OsStr::starts_with() method. That means that examples like these can
+//! should parse successfully ("X" represents invalid Unicode):
+//!
+//!     clap.exe X
+//!     clap.exe --some-arg X
+//!
+//! However, other string handling methods like OsStr::split_at_byte() and
+//! OsStr::trim_left_matches() still panic in the face of invalid Unicode. That
+//! means examples like these *don't* work:
+//!
+//!     clap.exe --some-arg=X
+//!     clap.exe -sX
+//!
+//! Thus this file omits tests for those cases. All of this OsStr handling is
+//! being rewritten for the 3.0 release in any case, so this limitation is
+//! specific to the 2.x series.
+
+#![cfg(windows)]
+
+extern crate clap;
+
+use clap::{App, Arg};
+use std::ffi::OsString;
+use std::os::windows::ffi::OsStringExt;
+
+fn bad_utf16_string() -> OsString {
+    // UTF-16 surrogate characters are only valid in pairs. Including one on
+    // the end by itself makes this invalid UTF-16.
+    let surrogate_char: u16 = 0xDC00;
+    let os = OsString::from_wide(&[surrogate_char]);
+    assert!(os.to_str().is_none(), "should be invalid Unicode");
+    os
+}
+
+#[test]
+fn invalid_utf16_lossy_positional() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("<arg> 'some arg'"))
+        .get_matches_from_safe(vec![OsString::from(""), bad_utf16_string()]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
+}
+
+#[test]
+fn invalid_utf16_lossy_option_short_space() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
+        .get_matches_from_safe(vec![
+            OsString::from(""),
+            OsString::from("-a"),
+            bad_utf16_string(),
+        ]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
+}
+
+#[test]
+fn invalid_utf16_lossy_option_long_space() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
+        .get_matches_from_safe(vec![
+            OsString::from(""),
+            OsString::from("--arg"),
+            bad_utf16_string(),
+        ]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
+}
+
+#[test]
+fn invalid_utf16_positional() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("<arg> 'some arg'"))
+        .get_matches_from_safe(vec![OsString::from(""), bad_utf16_string()]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
+}
+
+#[test]
+fn invalid_utf16_option_short_space() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
+        .get_matches_from_safe(vec![
+            OsString::from(""),
+            OsString::from("-a"),
+            bad_utf16_string(),
+        ]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
+}
+
+#[test]
+fn invalid_utf16_option_long_space() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
+        .get_matches_from_safe(vec![
+            OsString::from(""),
+            OsString::from("--arg"),
+            bad_utf16_string(),
+        ]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
+}