Skip to content

Commit 703caf3

Browse files
committed
prevent some panics when parsing invalid Unicode on Windows
Previously, OsStr::starts_with() would panic on Windows if the OsStr contained any invalid Unicode. Because this method is in the critical path for parsing command line arguments, Clap up to v2.33.0 on Windows panics on non-Unicode filenames, even when the application specifically requests OsString to account for this case. As a workaround, this commit adds a custom implementation of OsStr::starts_with() for Windows that doesn't rely on OsStr::as_bytes(). With this change, examples like the following can parse successfully. Here "X" represents invalid Unicode: clap.exe X clap.exe --some-arg X However, examples like these will still panic: clap.exe --some-arg=X clap.exe -sX These still panic, because they require string splitting operations like OsStr::split_at_byte() and OsStr::trim_left_matches(). Fixing these would require either breaking open the in-memory representation of OsStr, which is not stable, or changing all these APIs to allow for an allocated OsString/Cow<OsStr>. This fix is aiming to be minimal and also not wildly unsafe, so we leave these bugs in place. Hopefully the majority of invalid Unicode in the wild occurs in filepaths given as standalone arguments, and many of the rest can be converted from flags with `=` to flags with a space. Fixes #1905.
1 parent 0129fe5 commit 703caf3

File tree

2 files changed

+204
-0
lines changed

2 files changed

+204
-0
lines changed

src/osstringext.rs

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,73 @@ pub trait OsStrExt2 {
2020
fn split(&self, b: u8) -> OsSplit;
2121
}
2222

23+
// A starts-with implementation that does not panic when the OsStr contains
24+
// invalid Unicode.
25+
//
26+
// A Windows OsStr is usually UTF-16. If `prefix` is valid UTF-8, we can
27+
// re-encode it as UTF-16, and ask whether `osstr` starts with the same series
28+
// of u16 code units. If `prefix` is not valid UTF-8, then this comparison
29+
// isn't meaningful, and we just return false.
30+
#[cfg(target_os = "windows")]
31+
fn windows_osstr_starts_with(osstr: &OsStr, prefix: &[u8]) -> bool {
32+
use std::os::windows::ffi::OsStrExt;
33+
let prefix_str = if let Ok(s) = std::str::from_utf8(prefix) {
34+
s
35+
} else {
36+
return false;
37+
};
38+
let mut osstr_units = osstr.encode_wide();
39+
let mut prefix_units = prefix_str.encode_utf16();
40+
loop {
41+
match (osstr_units.next(), prefix_units.next()) {
42+
// These code units match. Keep looping.
43+
(Some(o), Some(p)) if o == p => continue,
44+
// We've reached the end of the prefix. It's a match.
45+
(_, None) => return true,
46+
// Otherwise, it's not a match.
47+
_ => return false,
48+
}
49+
}
50+
}
51+
52+
#[test]
53+
#[cfg(target_os = "windows")]
54+
fn test_windows_osstr_starts_with() {
55+
use std::ffi::OsString;
56+
use std::os::windows::ffi::OsStringExt;
57+
58+
fn from_ascii(ascii: &[u8]) -> OsString {
59+
let u16_vec: Vec<u16> = ascii.iter().map(|&c| c as u16).collect();
60+
OsString::from_wide(&u16_vec)
61+
}
62+
63+
// Test all the basic cases.
64+
assert!(windows_osstr_starts_with(&from_ascii(b"abcdef"), b"abc"));
65+
assert!(windows_osstr_starts_with(&from_ascii(b"abcdef"), b"abcdef"));
66+
assert!(!windows_osstr_starts_with(&from_ascii(b"abcdef"), b"def"));
67+
assert!(!windows_osstr_starts_with(&from_ascii(b"abc"), b"abcd"));
68+
69+
// Test the case where the candidate prefix is not valid UTF-8. Note that a
70+
// standalone \xff byte is valid ASCII but not valid UTF-8. Thus although
71+
// these strings look identical, they do not match.
72+
assert!(!windows_osstr_starts_with(&from_ascii(b"\xff"), b"\xff"));
73+
74+
// Test the case where the OsString is not valid UTF-16. It should still be
75+
// possible to match the valid characters at the front.
76+
//
77+
// UTF-16 surrogate characters are only valid in pairs. Including one on
78+
// the end by itself makes this invalid UTF-16.
79+
let surrogate_char: u16 = 0xDC00;
80+
let mut invalid_unicode =
81+
OsString::from_wide(&['a' as u16, 'b' as u16, 'c' as u16, surrogate_char]);
82+
assert!(
83+
invalid_unicode.to_str().is_none(),
84+
"This string is invalid Unicode, and conversion to &str should fail.",
85+
);
86+
assert!(windows_osstr_starts_with(&invalid_unicode, b"abc"));
87+
assert!(!windows_osstr_starts_with(&invalid_unicode, b"abcd"));
88+
}
89+
2390
#[cfg(any(target_os = "windows", target_arch = "wasm32"))]
2491
impl OsStrExt3 for OsStr {
2592
fn from_bytes(b: &[u8]) -> &Self {
@@ -33,6 +100,20 @@ impl OsStrExt3 for OsStr {
33100

34101
impl OsStrExt2 for OsStr {
35102
fn starts_with(&self, s: &[u8]) -> bool {
103+
#[cfg(target_os = "windows")]
104+
{
105+
// On Windows, the as_bytes() method will panic if the OsStr
106+
// contains invalid Unicode. To avoid this, we use a
107+
// Windows-specific starts-with function that doesn't rely on
108+
// as_bytes(). This is necessary for Windows command line
109+
// applications to handle non-Unicode arguments successfully. This
110+
// allows common cases like `clap.exe [invalid]` to succeed, though
111+
// cases that require string splitting will still fail, like
112+
// `clap.exe --arg=[invalid]`. Note that this entire module is
113+
// replaced in Clap 3.x, so this workaround is specific to the 2.x
114+
// branch.
115+
return windows_osstr_starts_with(self, s);
116+
}
36117
self.as_bytes().starts_with(s)
37118
}
38119

tests/utf16.rs

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
//! These Windows-only tests are ported from the non-Windows tests in
2+
//! tests/utf8.rs. Two categories of tests are omitted, however:
3+
//!
4+
//! 1. We don't include the tests that use StrictUtf8 mode, since that's
5+
//! eplicitly Unix-only.
6+
//! 2. We don't include tests that require splitting invalid Unicode strings.
7+
//!
8+
//! Elaborating on that second point: We have fixed invalid Unicode handling in
9+
//! the OsStr::starts_with() method. That means that examples like these can
10+
//! should parse successfully ("X" represents invalid Unicode):
11+
//!
12+
//! clap.exe X
13+
//! clap.exe --some-arg X
14+
//!
15+
//! However, other string handling methods like OsStr::split_at_byte() and
16+
//! OsStr::trim_left_matches() still panic in the face of invalid Unicode. That
17+
//! means examples like these *don't* work:
18+
//!
19+
//! clap.exe --some-arg=X
20+
//! clap.exe -sX
21+
//!
22+
//! Thus this file omits tests for those cases. All of this OsStr handling is
23+
//! being rewritten for the 3.0 release in any case, so this limitation is
24+
//! specific to the 2.x series.
25+
26+
#![cfg(windows)]
27+
28+
extern crate clap;
29+
30+
use clap::{App, Arg};
31+
use std::ffi::OsString;
32+
use std::os::windows::ffi::OsStringExt;
33+
34+
fn bad_utf16_string() -> OsString {
35+
// UTF-16 surrogate characters are only valid in pairs. Including one on
36+
// the end by itself makes this invalid UTF-16.
37+
let surrogate_char: u16 = 0xDC00;
38+
let os = OsString::from_wide(&[surrogate_char]);
39+
assert!(os.to_str().is_none(), "should be invalid Unicode");
40+
os
41+
}
42+
43+
#[test]
44+
fn invalid_utf16_lossy_positional() {
45+
let r = App::new("bad_utf16")
46+
.arg(Arg::from_usage("<arg> 'some arg'"))
47+
.get_matches_from_safe(vec![OsString::from(""), bad_utf16_string()]);
48+
assert!(r.is_ok());
49+
let m = r.unwrap();
50+
assert!(m.is_present("arg"));
51+
assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
52+
}
53+
54+
#[test]
55+
fn invalid_utf16_lossy_option_short_space() {
56+
let r = App::new("bad_utf16")
57+
.arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
58+
.get_matches_from_safe(vec![
59+
OsString::from(""),
60+
OsString::from("-a"),
61+
bad_utf16_string(),
62+
]);
63+
assert!(r.is_ok());
64+
let m = r.unwrap();
65+
assert!(m.is_present("arg"));
66+
assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
67+
}
68+
69+
#[test]
70+
fn invalid_utf16_lossy_option_long_space() {
71+
let r = App::new("bad_utf16")
72+
.arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
73+
.get_matches_from_safe(vec![
74+
OsString::from(""),
75+
OsString::from("--arg"),
76+
bad_utf16_string(),
77+
]);
78+
assert!(r.is_ok());
79+
let m = r.unwrap();
80+
assert!(m.is_present("arg"));
81+
assert_eq!(&*m.value_of_lossy("arg").unwrap(), "\u{FFFD}");
82+
}
83+
84+
#[test]
85+
fn invalid_utf16_positional() {
86+
let r = App::new("bad_utf16")
87+
.arg(Arg::from_usage("<arg> 'some arg'"))
88+
.get_matches_from_safe(vec![OsString::from(""), bad_utf16_string()]);
89+
assert!(r.is_ok());
90+
let m = r.unwrap();
91+
assert!(m.is_present("arg"));
92+
assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
93+
}
94+
95+
#[test]
96+
fn invalid_utf16_option_short_space() {
97+
let r = App::new("bad_utf16")
98+
.arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
99+
.get_matches_from_safe(vec![
100+
OsString::from(""),
101+
OsString::from("-a"),
102+
bad_utf16_string(),
103+
]);
104+
assert!(r.is_ok());
105+
let m = r.unwrap();
106+
assert!(m.is_present("arg"));
107+
assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
108+
}
109+
110+
#[test]
111+
fn invalid_utf16_option_long_space() {
112+
let r = App::new("bad_utf16")
113+
.arg(Arg::from_usage("-a, --arg <arg> 'some arg'"))
114+
.get_matches_from_safe(vec![
115+
OsString::from(""),
116+
OsString::from("--arg"),
117+
bad_utf16_string(),
118+
]);
119+
assert!(r.is_ok());
120+
let m = r.unwrap();
121+
assert!(m.is_present("arg"));
122+
assert_eq!(&*m.value_of_os("arg").unwrap(), &*bad_utf16_string());
123+
}

0 commit comments

Comments
 (0)