
Commit c2a13da

🐛 Fix embedded ebook metadata parsing (#752)
* 🐛 Fix embedded ebook metadata parsing
* swap to trace for logs
1 parent bc41519 · commit c2a13da

File tree

4 files changed (+195 -10 lines)
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="uuid_id" prefix="calibre: https://calibre-ebook.com">
+  <metadata xmlns:opf="http://www.idpf.org/2007/opf"
+    xmlns:dc="http://purl.org/dc/elements/1.1/"
+    xmlns:dcterms="http://purl.org/dc/terms/"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata">
+    <dc:title id="id">The Long Way to a Small, Angry Planet</dc:title>
+    <dc:creator id="id-1">Becky Chambers</dc:creator>
+    <dc:identifier>isbn:9780062444134</dc:identifier>
+    <dc:identifier>mobi-asin:B00M0DRZ56</dc:identifier>
+    <dc:identifier>calibre:42</dc:identifier>
+    <dc:identifier>uuid:0fbf1b90-f134-4315-9a8a-c135cfe7a605</dc:identifier>
+    <dc:identifier id="uuid_id">uuid:0fbf1b90-f134-4315-9a8a-c135cfe7a605</dc:identifier>
+    <dc:rights>Copyright © Becky Chambers 2014</dc:rights>
+    <dc:language>en</dc:language>
+    <dc:date>2014-07-29T04:00:00+00:00</dc:date>
+    <dc:description>&lt;div&gt;
+      &lt;p&gt;&lt;em&gt;Follow a motley crew on an exciting journey through space-and one adventurous young explorer who discovers the meaning of family in the far reaches of the universe-in this light-hearted debut space opera from a rising sci-fi star.&lt;/em&gt;&lt;br&gt;&lt;br&gt;Rosemary Harper doesn't expect much when she joins the crew of the aging Wayfarer. While the patched-up ship has seen better days, it offers her a bed, a chance to explore the far-off corners of the galaxy, and most importantly, some distance from her past. An introspective young woman who learned early to keep to herself, she's never met anyone remotely like the ship's diverse crew. Life aboard the Wayfarer is chaotic and crazy—exactly what Rosemary wants. It's also about to get extremely dangerous when the crew is offered the job of a lifetime.&lt;/p&gt;&lt;/div&gt;</dc:description>
+    <dc:publisher>Harper Voyager</dc:publisher>
+    <dc:subject>Science fiction</dc:subject>
+    <dc:subject>Space Opera</dc:subject>
+    <dc:subject>LGBT</dc:subject>
+    <dc:subject>Fiction</dc:subject>
+    <dc:subject>Queer</dc:subject>
+    <opf:meta refines="#id" property="title-type">main</opf:meta>
+    <opf:meta refines="#id" property="file-as">Long Way to a Small, Angry Planet, The</opf:meta>
+    <meta name="primary-writing-mode" content="horizontal-lr"/>
+    <meta property="dcterms:modified" scheme="dcterms:W3CDTF">2025-09-02T16:04:09Z</meta>
+    <meta property="calibre:timestamp" scheme="dcterms:W3CDTF">2025-06-12T19:14:18Z</meta>
+    <opf:meta refines="#id-1" property="role" scheme="marc:relators">aut</opf:meta>
+    <opf:meta refines="#id-1" property="file-as">Chambers, Becky</opf:meta>
+    <opf:meta property="belongs-to-collection" id="id-2">Wayfarers</opf:meta>
+    <opf:meta refines="#id-2" property="collection-type">series</opf:meta>
+    <opf:meta refines="#id-2" property="group-position">1</opf:meta>
+  </metadata>
+</package>

core/src/filesystem/media/format/epub.rs

Lines changed: 150 additions & 7 deletions
@@ -91,15 +91,31 @@ impl FileProcessor for EpubProcessor {
 	}

 	fn process_metadata(path: &str) -> Result<Option<ProcessedMediaMetadata>, FileError> {
-		let epub_file = Self::open(path)?;
-		let embedded_metadata = ProcessedMediaMetadata::from(epub_file.metadata);
+		let mut epub_file = Self::open(path)?;
+		let mut embedded_metadata =
+			ProcessedMediaMetadata::from(epub_file.metadata.clone());
+
+		tracing::trace!(before = ?embedded_metadata, "Processing embedded metadata");
+
+		let root_file_path = epub_file.root_file.clone();
+		if let Some(Ok(parsed_embedded_metadata)) = epub_file
+			.get_resource_str_by_path(&root_file_path)
+			.map(|xml| parse_opf_xml(&xml))
+		{
+			let additional_metadata =
+				ProcessedMediaMetadata::from(parsed_embedded_metadata);
+			// Prioritize the additional over epub-rs since it is less comprehensive
+			embedded_metadata.merge(additional_metadata);
+		}
+
+		tracing::trace!(after = ?embedded_metadata, "Merged embedded metadata");

 		let file_path = std::path::Path::new(path).with_extension("opf");
 		if file_path.exists() {
 			let opf_string = std::fs::read_to_string(file_path)?;
 			let opf_metadata = parse_opf_xml(&opf_string)?;

-			// merge opf and embedded, prioritizing opf
+			// Prioritize the OPF metadata over the embedded metadata
 			let opf_metadata = ProcessedMediaMetadata::from(opf_metadata);
 			let mut combined_metadata = opf_metadata.clone();

@@ -469,6 +485,10 @@ fn parse_opf_xml(opf_content: &str) -> Result<HashMap<String, Vec<String>>, File
 						let property = String::from_utf8_lossy(&attr.value);
 						current_tag = property.to_string();
 					},
+					b"property" if tag_name == "opf:meta" => {
+						let property = String::from_utf8_lossy(&attr.value);
+						current_tag = property.to_string();
+					},
 					_ => {},
 				}
 			}
@@ -542,10 +562,47 @@ fn parse_opf_xml(opf_content: &str) -> Result<HashMap<String, Vec<String>>, File
 				if let Ok(text) = e.unescape() {
 					let content = text.trim().to_string();
 					if !content.is_empty() {
-						opf_metadata
-							.entry(current_tag.clone())
-							.or_default()
-							.push(content);
+						match current_tag.as_str() {
+							"belongs-to-collection" => {
+								opf_metadata
+									.entry("collection_name".to_string())
+									.or_default()
+									.push(content.clone());
+							},
+							"collection-type" => {
+								opf_metadata
+									.entry("collection_type".to_string())
+									.or_default()
+									.push(content.clone());
+							},
+							"group-position" => {
+								opf_metadata
+									.entry("collection_position".to_string())
+									.or_default()
+									.push(content.clone());
+							},
+							"identifier" => {
+								// Some books seem to have prefixed identifiers (e.g., "isbn:9780062444134")
+								if let Some(colon_pos) = content.find(':') {
+									let scheme = content[..colon_pos].to_lowercase();
+									let value = content[colon_pos + 1..].to_string();
+									let key = format!("identifier_{}", scheme);
+									opf_metadata.entry(key).or_default().push(value);
+								} else {
+									// No prefix, treat as generic identifier
+									opf_metadata
+										.entry(current_tag.clone())
+										.or_default()
+										.push(content);
+								}
+							},
+							_ => {
+								opf_metadata
+									.entry(current_tag.clone())
+									.or_default()
+									.push(content);
+							},
+						}
 					}
 				}
 			}
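The identifier arm above splits Calibre-style prefixed identifiers on the first colon, lowercases the scheme, and stores the value under an identifier_<scheme> key. A minimal standalone sketch of that keying rule (split_identifier is a hypothetical helper name, used here only for illustration):

// Hypothetical helper mirroring the "identifier" arm above: a prefixed value
// is split on the first colon, the scheme is lowercased, and the pair becomes
// an identifier_<scheme> key; unprefixed values keep the generic "identifier" key.
fn split_identifier(content: &str) -> (String, String) {
    match content.find(':') {
        Some(colon_pos) => {
            let scheme = content[..colon_pos].to_lowercase();
            let value = content[colon_pos + 1..].to_string();
            (format!("identifier_{}", scheme), value)
        },
        None => ("identifier".to_string(), content.to_string()),
    }
}

fn main() {
    assert_eq!(
        split_identifier("isbn:9780062444134"),
        ("identifier_isbn".to_string(), "9780062444134".to_string())
    );
    assert_eq!(
        split_identifier("mobi-asin:B00M0DRZ56"),
        ("identifier_mobi-asin".to_string(), "B00M0DRZ56".to_string())
    );
    assert_eq!(
        split_identifier("9780062444134"),
        ("identifier".to_string(), "9780062444134".to_string())
    );
}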
@@ -916,6 +973,92 @@ mod tests {
 		}
 	}

+	#[test]
+	fn test_parse_calibre_3_opf() {
+		let opf_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+			.join("integration-tests")
+			.join("data")
+			.join("calibre-2.opf");
+
+		let opf_content = std::fs::read_to_string(&opf_path)
+			.expect("Failed to read calibre-2.opf test file");
+
+		let metadata =
+			parse_opf_xml(&opf_content).expect("Failed to parse calibre-3.opf");
+
+		assert_eq!(
+			metadata.get("title"),
+			Some(&vec!["The Long Way to a Small, Angry Planet".to_string()])
+		);
+		assert_eq!(
+			metadata.get("creator"),
+			Some(&vec!["Becky Chambers".to_string()])
+		);
+		assert_eq!(
+			metadata.get("publisher"),
+			Some(&vec!["Harper Voyager".to_string()])
+		);
+		assert_eq!(metadata.get("language"), Some(&vec!["en".to_string()]));
+		assert_eq!(
+			metadata.get("date"),
+			Some(&vec!["2014-07-29T04:00:00+00:00".to_string()])
+		);
+
+		let subjects = metadata.get("subject").expect("Should have subjects");
+		assert_eq!(subjects.len(), 5);
+		assert!(subjects.contains(&"Science fiction".to_string()));
+		assert!(subjects.contains(&"Space Opera".to_string()));
+		assert!(subjects.contains(&"LGBT".to_string()));
+		assert!(subjects.contains(&"Fiction".to_string()));
+		assert!(subjects.contains(&"Queer".to_string()));
+
+		// Test the different format for series info
+		assert_eq!(
+			metadata.get("collection_name"),
+			Some(&vec!["Wayfarers".to_string()])
+		);
+		assert_eq!(
+			metadata.get("collection_type"),
+			Some(&vec!["series".to_string()])
+		);
+		assert_eq!(
+			metadata.get("collection_position"),
+			Some(&vec!["1".to_string()])
+		);
+
+		// Test prefixed identifiers
+		assert_eq!(
+			metadata.get("identifier_isbn"),
+			Some(&vec!["9780062444134".to_string()])
+		);
+		assert_eq!(
+			metadata.get("identifier_mobi-asin"),
+			Some(&vec!["B00M0DRZ56".to_string()])
+		);
+		assert_eq!(
+			metadata.get("identifier_calibre"),
+			Some(&vec!["42".to_string()])
+		);
+
+		let expected_keys = [
+			"title",
+			"creator",
+			"date",
+			"publisher",
+			"language",
+			"subject",
+			"identifier_isbn",
+			"identifier_mobi-asin",
+			"identifier_calibre",
+			"collection_name",
+			"collection_type",
+			"collection_position",
+		];
+		for key in expected_keys.iter() {
+			assert!(metadata.contains_key(*key), "Missing expected key: {}", key);
+		}
+	}
+
 	#[test]
 	fn test_get_page_content_types() {
 		let path = get_test_epub_path();

core/src/filesystem/media/metadata.rs

Lines changed: 7 additions & 3 deletions
@@ -218,16 +218,20 @@ impl From<HashMap<String, Vec<String>>> for ProcessedMediaMetadata {
 			match key.to_lowercase().as_str() {
 				"title" => metadata.title = Some(value.join("\n").to_string()),
 				"title_sort" => metadata.title_sort = Some(value.join("\n").to_string()),
-				"series" => metadata.series = Some(value.join("\n").to_string()),
-				"number" | "series_index" => {
+				"series" | "collection_name" => {
+					metadata.series = Some(value.join("\n").to_string())
+				},
+				"number" | "series_index" | "collection_position" => {
 					metadata.number =
 						value.into_iter().next().and_then(|n| n.parse().ok());
 				},
 				"volume" => {
 					metadata.volume =
 						value.into_iter().next().and_then(|n| n.parse().ok());
 				},
-				"summary" => metadata.summary = Some(value.join("\n").to_string()),
+				"summary" | "description" | "synopsis" => {
+					metadata.summary = Some(value.join("\n").to_string())
+				},
 				"notes" => metadata.notes = Some(value.join("\n").to_string()),
 				"genre" | "genres" | "subject" | "subjects" => {
 					metadata.genres = Some(value)
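With these aliases, the Calibre 3-style collection_* keys produced by parse_opf_xml land in the same series fields as the older keys. A minimal sketch of that mapping under the same rules, using a hypothetical SeriesInfo struct as a stand-in for the relevant ProcessedMediaMetadata fields (field types here are an assumption for illustration):

use std::collections::HashMap;

// Hypothetical stand-in for the series-related fields of ProcessedMediaMetadata.
#[derive(Debug, Default)]
struct SeriesInfo {
    series: Option<String>,
    number: Option<f64>,
}

// Mirrors the match arms above: "series"/"collection_name" feed the series name,
// "number"/"series_index"/"collection_position" feed the numeric position.
fn series_from_map(map: &HashMap<String, Vec<String>>) -> SeriesInfo {
    let mut info = SeriesInfo::default();
    for (key, value) in map {
        match key.to_lowercase().as_str() {
            "series" | "collection_name" => info.series = Some(value.join("\n")),
            "number" | "series_index" | "collection_position" => {
                info.number = value.iter().next().and_then(|n| n.parse().ok())
            },
            _ => {},
        }
    }
    info
}

fn main() {
    let map = HashMap::from([
        ("collection_name".to_string(), vec!["Wayfarers".to_string()]),
        ("collection_position".to_string(), vec!["1".to_string()]),
    ]);
    let info = series_from_map(&map);
    assert_eq!(info.series.as_deref(), Some("Wayfarers"));
    assert_eq!(info.number, Some(1.0));
}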

packages/browser/src/components/markdown/MarkdownPreview.tsx

Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,7 @@
 import { cn, cx, Divider, Heading, Text } from '@stump/components'
 import { forwardRef, PropsWithChildren, useState } from 'react'
 import ReactMarkdown from 'react-markdown'
+import rehypeRaw from 'rehype-raw'
 import remarkDirective from 'remark-directive'
 import remarkDirectiveRehype from 'remark-directive-rehype'
 import remarkGfm from 'remark-gfm'
@@ -15,6 +16,7 @@ export default function MarkdownPreview({ children, className }: Props) {
 	return (
 		<ReactMarkdown
 			remarkPlugins={[remarkDirective, remarkDirectiveRehype, remarkGfm]}
+			rehypePlugins={[rehypeRaw]}
 			className={cn('text-foreground-subtle', className)}
 			components={{
 				h1: ({ ref: _, ...props }) => (
