From 703caf37c49517ef72550cf310ff781eab90c4a2 Mon Sep 17 00:00:00 2001
From: Jack O'Connor <oconnor663@gmail.com>
Date: Tue, 5 May 2020 12:03:09 -0400
Subject: [PATCH] prevent some panics when parsing invalid Unicode on Windows

Previously, OsStr::starts_with() would panic on Windows if the OsStr
contained any invalid Unicode. Because this method is in the critical
path for parsing command line arguments, Clap up to v2.33.0 on Windows
panics on non-Unicode filenames, even when the application specifically
requests OsString to account for this case. As a workaround, this commit
adds a custom implementation of OsStr::starts_with() for Windows that
doesn't rely on OsStr::as_bytes().

With this change, examples like the following can parse successfully.
Here "X" represents invalid Unicode:

    clap.exe X
    clap.exe --some-arg X

However, examples like these will still panic:

    clap.exe --some-arg=X
    clap.exe -sX

These still panic, because they require string splitting operations like
OsStr::split_at_byte() and OsStr::trim_left_matches(). Fixing these
would require either breaking open the in-memory representation of
OsStr, which is not stable, or changing all these APIs to allow for an
allocated OsString/Cow<OsStr>. This fix is aiming to be minimal and also
not wildly unsafe, so we leave these bugs in place. Hopefully the
majority of invalid Unicode in the wild occurs in filepaths given as
standalone arguments, and many of the rest can be converted from flags
with `=` to flags with a space.

Fixes https://github.com/clap-rs/clap/issues/1905.
---
 src/osstringext.rs |  81 +++++++++++++++++++++++++++++
 tests/utf16.rs     | 123 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 204 insertions(+)
 create mode 100644 tests/utf16.rs
diff --git a/src/osstringext.rs b/src/osstringext.rs
index 50b0970fc03..ee0de302e2a 100644
--- a/src/osstringext.rs
+++ b/src/osstringext.rs
@@ -20,6 +20,73 @@ pub trait OsStrExt2 {
     fn split(&self, b: u8) -> OsSplit;
 }
 
+// A starts-with implementation that does not panic when the OsStr contains
+// invalid Unicode.
+//
+// A Windows OsStr is usually UTF-16. If `prefix` is valid UTF-8, we can
+// re-encode it as UTF-16, and ask whether `osstr` starts with the same series
+// of u16 code units. If `prefix` is not valid UTF-8, then this comparison
+// isn't meaningful, and we just return false.
+#[cfg(target_os = "windows")]
+fn windows_osstr_starts_with(osstr: &OsStr, prefix: &[u8]) -> bool {
+    use std::os::windows::ffi::OsStrExt;
+    let prefix_str = if let Ok(s) = std::str::from_utf8(prefix) {
+        s
+    } else {
+        return false;
+    };
+    let mut osstr_units = osstr.encode_wide();
+    let mut prefix_units = prefix_str.encode_utf16();
+    loop {
+        match (osstr_units.next(), prefix_units.next()) {
+            // These code units match. Keep looping.
+            (Some(o), Some(p)) if o == p => continue,
+            // We've reached the end of the prefix. It's a match.
+            (_, None) => return true,
+            // Otherwise, it's not a match.
+            _ => return false,
+        }
+    }
+}
+
+#[test]
+#[cfg(target_os = "windows")]
+fn test_windows_osstr_starts_with() {
+    use std::ffi::OsString;
+    use std::os::windows::ffi::OsStringExt;
+
+    fn from_ascii(ascii: &[u8]) -> OsString {
+        let u16_vec: Vec<u16> = ascii.iter().map(|&c| c as u16).collect();
+        OsString::from_wide(&u16_vec)
+    }
+
+    // Test all the basic cases.
+    assert!(windows_osstr_starts_with(&from_ascii(b"abcdef"), b"abc"));
+    assert!(windows_osstr_starts_with(&from_ascii(b"abcdef"), b"abcdef"));
+    assert!(!windows_osstr_starts_with(&from_ascii(b"abcdef"), b"def"));
+    assert!(!windows_osstr_starts_with(&from_ascii(b"abc"), b"abcd"));
+
+    // Test the case where the candidate prefix is not valid UTF-8. Note that a
+    // standalone \xff byte is valid ASCII but not valid UTF-8. Thus although
+    // these strings look identical, they do not match.
+    assert!(!windows_osstr_starts_with(&from_ascii(b"\xff"), b"\xff"));
+
+    // Test the case where the OsString is not valid UTF-16. It should still be
+    // possible to match the valid characters at the front.
+    //
+    // UTF-16 surrogate characters are only valid in pairs. Including one on
+    // the end by itself makes this invalid UTF-16.
+    let surrogate_char: u16 = 0xDC00;
+    let mut invalid_unicode =
+        OsString::from_wide(&['a' as u16, 'b' as u16, 'c' as u16, surrogate_char]);
+    assert!(
+        invalid_unicode.to_str().is_none(),
+        "This string is invalid Unicode, and conversion to &str should fail.",
+    );
+    assert!(windows_osstr_starts_with(&invalid_unicode, b"abc"));
+    assert!(!windows_osstr_starts_with(&invalid_unicode, b"abcd"));
+}
+
 #[cfg(any(target_os = "windows", target_arch = "wasm32"))]
 impl OsStrExt3 for OsStr {
     fn from_bytes(b: &[u8]) -> &Self {
@@ -33,6 +100,20 @@ impl OsStrExt3 for OsStr {
 
 impl OsStrExt2 for OsStr {
     fn starts_with(&self, s: &[u8]) -> bool {
+        #[cfg(target_os = "windows")]
+        {
+            // On Windows, the as_bytes() method will panic if the OsStr
+            // contains invalid Unicode. To avoid this, we use a
+            // Windows-specific starts-with function that doesn't rely on
+            // as_bytes(). This is necessary for Windows command line
+            // applications to handle non-Unicode arguments successfully. This
+            // allows common cases like `clap.exe [invalid]` to succeed, though
+            // cases that require string splitting will still fail, like
+            // `clap.exe --arg=[invalid]`. Note that this entire module is
+            // replaced in Clap 3.x, so this workaround is specific to the 2.x
+            // branch.
+            return windows_osstr_starts_with(self, s);
+        }
         self.as_bytes().starts_with(s)
     }
 
diff --git a/tests/utf16.rs b/tests/utf16.rs
new file mode 100644
index 00000000000..38dec0ae976
--- /dev/null
+++ b/tests/utf16.rs
@@ -0,0 +1,123 @@
+//! These Windows-only tests are ported from the non-Windows tests in
+//! tests/utf8.rs. Two categories of tests are omitted, however:
+//!
+//! 1. We don't include the tests that use StrictUtf8 mode, since that's
+//!    eplicitly Unix-only.
+//! 2. We don't include tests that require splitting invalid Unicode strings.
+//!
+//! Elaborating on that second point: We have fixed invalid Unicode handling in
+//! the OsStr::starts_with() method. That means that examples like these can
+//! should parse successfully ("X" represents invalid Unicode):
+//!
+//!     clap.exe X
+//!     clap.exe --some-arg X
+//!
+//! However, other string handling methods like OsStr::split_at_byte() and
+//! OsStr::trim_left_matches() still panic in the face of invalid Unicode. That
+//! means examples like these *don't* work:
+//!
+//!     clap.exe --some-arg=X
+//!     clap.exe -sX
+//!
+//! Thus this file omits tests for those cases. All of this OsStr handling is
+//! being rewritten for the 3.0 release in any case, so this limitation is
+//! specific to the 2.x series.
+
+#![cfg(windows)]
+
+extern crate clap;
+
+use clap::{App, Arg};
+use std::ffi::OsString;
+use std::os::windows::ffi::OsStringExt;
+
+fn bad_utf16_string() -> OsString {
+    // UTF-16 surrogate characters are only valid in pairs. Including one on
+    // the end by itself makes this invalid UTF-16.
+    let surrogate_char: u16 = 0xDC00;
+    let os = OsString::from_wide(&[surrogate_char]);
+    assert!(os.to_str().is_none(), "should be invalid Unicode");
+    os
+}
+
+#[test]
+fn invalid_utf16_lossy_positional() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("<arg> 'some arg'"))
+        .get_matches_from_safe(vec![OsString::from(""), bad_utf16_string()]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
+}
+
+#[test]
+fn invalid_utf16_lossy_option_short_space() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
+        .get_matches_from_safe(vec![
+            OsString::from(""),
+            OsString::from("-a"),
+            bad_utf16_string(),
+        ]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
+}
+
+#[test]
+fn invalid_utf16_lossy_option_long_space() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
+        .get_matches_from_safe(vec![
+            OsString::from(""),
+            OsString::from("--arg"),
+            bad_utf16_string(),
+        ]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
+}
+
+#[test]
+fn invalid_utf16_positional() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("<arg> 'some arg'"))
+        .get_matches_from_safe(vec![OsString::from(""), bad_utf16_string()]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
+}
+
+#[test]
+fn invalid_utf16_option_short_space() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
+        .get_matches_from_safe(vec![
+            OsString::from(""),
+            OsString::from("-a"),
+            bad_utf16_string(),
+        ]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
+}
+
+#[test]
+fn invalid_utf16_option_long_space() {
+    let r = App::new("bad_utf16")
+        .arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
+        .get_matches_from_safe(vec![
+            OsString::from(""),
+            OsString::from("--arg"),
+            bad_utf16_string(),
+        ]);
+    assert!(r.is_ok());
+    let m = r.unwrap();
+    assert!(m.is_present("arg"));
+    assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
+}