Skip to content

Commit 0d0dee6

Browse files
authored
Support detecting doc/xls/ppt (#38)
* Check docProps in msooxml matcher
* Allow clippy::upper_case_acronyms lint
* Fix typo: OOXLM -> OOXML
* Support detecting doc/xls/ppt
1 parent fa860c4 commit 0d0dee6

File tree

4 files changed

+78
-36
lines changed

4 files changed

+78
-36
lines changed

Cargo.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,13 @@ exclude = ["testdata/*", "tests/*"]
1414

1515
[features]
1616
default = ["std"]
17-
std = ["alloc"]
17+
std = ["alloc", "cfb"]
1818
alloc = []
1919

2020
[[example]]
2121
name = "file"
2222
path = "examples/file.rs"
2323
required-features = ["std"]
24+
25+
[dependencies]
26+
cfb = { version = "0.4.0", optional = true }

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,6 @@ assert_eq!(kind.extension(), "foo");
209209

210210
## Known Issues
211211

212-
- `doc`, `ppt`, `xls`, `msi` all have the same magic number so it's not possible to tell which one just based on the binary data. `doc` is returned for all.
213212
- `exe` and `dll` have the same magic number, so the two cannot be told apart from the binary data alone. `exe` is returned for both.
214213

215214
## License

src/matchers/doc.rs

Lines changed: 37 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,18 @@ use core::convert::TryInto;
33
#[allow(clippy::upper_case_acronyms)]
44
#[derive(Debug, Eq, PartialEq)]
55
enum DocType {
6-
// DOC,
6+
DOC,
77
DOCX,
8-
// XLS,
8+
XLS,
99
XLSX,
10-
// PPT,
10+
PPT,
1111
PPTX,
12-
OOXLM,
12+
OOXML,
1313
}
1414

1515
/// Returns whether a buffer is Microsoft Word Document (DOC) data.
1616
pub fn is_doc(buf: &[u8]) -> bool {
17-
buf.len() > 7
18-
&& buf[0] == 0xD0
19-
&& buf[1] == 0xCF
20-
&& buf[2] == 0x11
21-
&& buf[3] == 0xE0
22-
&& buf[4] == 0xA1
23-
&& buf[5] == 0xB1
24-
&& buf[6] == 0x1A
25-
&& buf[7] == 0xE1
17+
ole2(buf) == Some(DocType::DOC)
2618
}
2719

2820
/// Returns whether a buffer is Microsoft Word Open XML Format Document (DOCX) data.
@@ -32,15 +24,7 @@ pub fn is_docx(buf: &[u8]) -> bool {
3224

3325
/// Returns whether a buffer is Microsoft Excel 97-2003 Worksheet (XLS) data.
3426
pub fn is_xls(buf: &[u8]) -> bool {
35-
buf.len() > 7
36-
&& buf[0] == 0xD0
37-
&& buf[1] == 0xCF
38-
&& buf[2] == 0x11
39-
&& buf[3] == 0xE0
40-
&& buf[4] == 0xA1
41-
&& buf[5] == 0xB1
42-
&& buf[6] == 0x1A
43-
&& buf[7] == 0xE1
27+
ole2(buf) == Some(DocType::XLS)
4428
}
4529

4630
/// Returns whether a buffer is Microsoft Excel Open XML Format Spreadsheet (XLSX) data.
@@ -50,15 +34,7 @@ pub fn is_xlsx(buf: &[u8]) -> bool {
5034

5135
/// Returns whether a buffer is Microsoft PowerPoint 97-2003 Presentation (PPT) data.
5236
pub fn is_ppt(buf: &[u8]) -> bool {
53-
buf.len() > 7
54-
&& buf[0] == 0xD0
55-
&& buf[1] == 0xCF
56-
&& buf[2] == 0x11
57-
&& buf[3] == 0xE0
58-
&& buf[4] == 0xA1
59-
&& buf[5] == 0xB1
60-
&& buf[6] == 0x1A
61-
&& buf[7] == 0xE1
37+
ole2(buf) == Some(DocType::PPT)
6238
}
6339

6440
/// Returns whether a buffer is Microsoft PowerPoint Open XML Presentation (PPTX) data.
@@ -108,15 +84,43 @@ fn msooxml(buf: &[u8]) -> Option<DocType> {
10884
let idx = search(buf, start_offset, 6000);
10985
match idx {
11086
Some(idx) => start_offset += idx + 4 + 26,
111-
None => return Some(DocType::OOXLM),
87+
None => return Some(DocType::OOXML),
11288
};
11389

11490
let typo = check_msooml(buf, start_offset);
11591
if typo.is_some() {
11692
return typo;
11793
}
11894

119-
Some(DocType::OOXLM)
95+
Some(DocType::OOXML)
96+
}
97+
98+
/// Classifies an OLE2 compound-file buffer as DOC, XLS or PPT (`std` builds only).
///
/// Returns `None` when the buffer fails the signature check, cannot be opened
/// by `cfb::CompoundFile::open`, or has a root-entry CLSID that is not one of
/// the values matched below.
#[cfg(feature = "std")]
99+
fn ole2(buf: &[u8]) -> Option<DocType> {
100+
use std::io::Cursor;
101+
102+
// Cheap pre-filter on the 8-byte signature before handing the buffer to `cfb`.
// NOTE(review): assumes `compare_bytes` tests `buf` against the pattern at offset 0 — confirm.
if !compare_bytes(buf, &[0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1], 0) {
103+
return None;
104+
}
// `Cursor` wraps the in-memory slice so it can be passed to `CompoundFile::open`.
105+
if let Ok(file) = cfb::CompoundFile::open(Cursor::new(buf)) {
106+
// The root entry's CLSID, rendered as a string, selects the format.
return match file.root_entry().clsid().to_string().as_str() {
107+
"00020810-0000-0000-c000-000000000046" | "00020820-0000-0000-c000-000000000046" => {
108+
Some(DocType::XLS)
109+
}
110+
"00020906-0000-0000-c000-000000000046" => Some(DocType::DOC),
111+
"64818d10-4f9b-11cf-86ea-00aa00b929e8" => Some(DocType::PPT),
112+
// Any other CLSID is left unclassified.
_ => None,
113+
};
114+
}
115+
// Failing to open the container is not reported as an error; the buffer is just unclassified.
None
116+
}
117+
118+
/// Fallback `ole2` for builds without the `std` feature.
///
/// This variant does not open the container: any buffer that passes the
/// 8-byte signature check is reported as `DocType::DOC`, and everything else
/// is `None`. It never returns `DocType::XLS` or `DocType::PPT`, so callers
/// comparing against those variants never match in this configuration.
#[cfg(not(feature = "std"))]
119+
fn ole2(buf: &[u8]) -> Option<DocType> {
120+
// NOTE(review): assumes `compare_bytes` tests `buf` against the pattern at offset 0 — confirm.
if !compare_bytes(buf, &[0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1], 0) {
121+
return None;
122+
}
123+
Some(DocType::DOC)
120124
}
121125

122126
fn compare_bytes(slice: &[u8], sub_slice: &[u8], start_offset: usize) -> bool {

tests/doc.rs

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,30 @@
11
mod common;
22

3-
test_format!(DOC, "application/msword", "doc", doc, "sample.doc");
3+
// Generates a module named `$format` holding a single `get` test: the test
// feeds `../testdata/$file` to `infer::get` and compares the result with a
// `Type` built from the given matcher type, MIME type and extension.
// Only compiled when the `std` feature is enabled.
#[cfg(feature = "std")]
4+
macro_rules! test_format_get_only {
5+
($exp_matchert:ident, $exp_mimet:expr, $exp_ext:expr, $format:ident, $file:expr) => {
6+
mod $format {
7+
use infer::{MatcherType, Type};
8+
9+
// Placeholder that never matches; it only fills the matcher argument of `Type::new`.
// NOTE(review): the assertion below can only pass if `Type` equality ignores the
// matcher function — confirm against `Type`'s `PartialEq` impl.
fn matcher(_buf: &[u8]) -> bool {
10+
false
11+
}
12+
13+
#[test]
14+
fn get() {
15+
let expected_kind =
16+
Type::new(MatcherType::$exp_matchert, $exp_mimet, $exp_ext, matcher);
17+
// The sample file is embedded at compile time, relative to this test source.
let buf = include_bytes!(concat!("../testdata/", $file));
18+
let kind = infer::get(buf).expect("test file matches");
19+
20+
assert_eq!(expected_kind, kind);
21+
}
22+
}
23+
};
24+
}
25+
26+
#[cfg(feature = "std")]
27+
test_format_get_only!(DOC, "application/msword", "doc", doc, "sample.doc");
428

529
test_format!(
630
DOC,
@@ -10,6 +34,9 @@ test_format!(
1034
"sample.docx"
1135
);
1236

37+
#[cfg(feature = "std")]
38+
test_format_get_only!(DOC, "application/vnd.ms-excel", "xls", xls, "sample.xls");
39+
1340
test_format!(
1441
DOC,
1542
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
@@ -18,6 +45,15 @@ test_format!(
1845
"sample.xlsx"
1946
);
2047

48+
#[cfg(feature = "std")]
49+
test_format_get_only!(
50+
DOC,
51+
"application/vnd.ms-powerpoint",
52+
"ppt",
53+
ppt,
54+
"sample.ppt"
55+
);
56+
2157
test_format!(
2258
DOC,
2359
"application/vnd.openxmlformats-officedocument.presentationml.presentation",

0 commit comments

Comments
 (0)