
Commit c2a13da

🐛 Fix embedded ebook metadata parsing (#752)
* 🐛 Fix embedded ebook metadata parsing
* swap to trace for logs
1 parent bc41519 · commit c2a13da

File tree

4 files changed (+195 -10 lines)
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="uuid_id" prefix="calibre: https://calibre-ebook.com">
+  <metadata xmlns:opf="http://www.idpf.org/2007/opf"
+    xmlns:dc="http://purl.org/dc/elements/1.1/"
+    xmlns:dcterms="http://purl.org/dc/terms/"
+    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+    xmlns:calibre="http://calibre.kovidgoyal.net/2009/metadata">
+    <dc:title id="id">The Long Way to a Small, Angry Planet</dc:title>
+    <dc:creator id="id-1">Becky Chambers</dc:creator>
+    <dc:identifier>isbn:9780062444134</dc:identifier>
+    <dc:identifier>mobi-asin:B00M0DRZ56</dc:identifier>
+    <dc:identifier>calibre:42</dc:identifier>
+    <dc:identifier>uuid:0fbf1b90-f134-4315-9a8a-c135cfe7a605</dc:identifier>
+    <dc:identifier id="uuid_id">uuid:0fbf1b90-f134-4315-9a8a-c135cfe7a605</dc:identifier>
+    <dc:rights>Copyright © Becky Chambers 2014</dc:rights>
+    <dc:language>en</dc:language>
+    <dc:date>2014-07-29T04:00:00+00:00</dc:date>
+    <dc:description>&lt;div&gt;
+      &lt;p&gt;&lt;em&gt;Follow a motley crew on an exciting journey through space-and one adventurous young explorer who discovers the meaning of family in the far reaches of the universe-in this light-hearted debut space opera from a rising sci-fi star.&lt;/em&gt;&lt;br&gt;&lt;br&gt;Rosemary Harper doesn't expect much when she joins the crew of the aging Wayfarer. While the patched-up ship has seen better days, it offers her a bed, a chance to explore the far-off corners of the galaxy, and most importantly, some distance from her past. An introspective young woman who learned early to keep to herself, she's never met anyone remotely like the ship's diverse crew. Life aboard the Wayfarer is chaotic and crazy—exactly what Rosemary wants. It's also about to get extremely dangerous when the crew is offered the job of a lifetime.&lt;/p&gt;&lt;/div&gt;</dc:description>
+    <dc:publisher>Harper Voyager</dc:publisher>
+    <dc:subject>Science fiction</dc:subject>
+    <dc:subject>Space Opera</dc:subject>
+    <dc:subject>LGBT</dc:subject>
+    <dc:subject>Fiction</dc:subject>
+    <dc:subject>Queer</dc:subject>
+    <opf:meta refines="#id" property="title-type">main</opf:meta>
+    <opf:meta refines="#id" property="file-as">Long Way to a Small, Angry Planet, The</opf:meta>
+    <meta name="primary-writing-mode" content="horizontal-lr"/>
+    <meta property="dcterms:modified" scheme="dcterms:W3CDTF">2025-09-02T16:04:09Z</meta>
+    <meta property="calibre:timestamp" scheme="dcterms:W3CDTF">2025-06-12T19:14:18Z</meta>
+    <opf:meta refines="#id-1" property="role" scheme="marc:relators">aut</opf:meta>
+    <opf:meta refines="#id-1" property="file-as">Chambers, Becky</opf:meta>
+    <opf:meta property="belongs-to-collection" id="id-2">Wayfarers</opf:meta>
+    <opf:meta refines="#id-2" property="collection-type">series</opf:meta>
+    <opf:meta refines="#id-2" property="group-position">1</opf:meta>
+  </metadata>
+</package>

core/src/filesystem/media/format/epub.rs

Lines changed: 150 additions & 7 deletions
@@ -91,15 +91,31 @@ impl FileProcessor for EpubProcessor {
 	}

 	fn process_metadata(path: &str) -> Result<Option<ProcessedMediaMetadata>, FileError> {
-		let epub_file = Self::open(path)?;
-		let embedded_metadata = ProcessedMediaMetadata::from(epub_file.metadata);
+		let mut epub_file = Self::open(path)?;
+		let mut embedded_metadata =
+			ProcessedMediaMetadata::from(epub_file.metadata.clone());
+
+		tracing::trace!(before = ?embedded_metadata, "Processing embedded metadata");
+
+		let root_file_path = epub_file.root_file.clone();
+		if let Some(Ok(parsed_embedded_metadata)) = epub_file
+			.get_resource_str_by_path(&root_file_path)
+			.map(|xml| parse_opf_xml(&xml))
+		{
+			let additional_metadata =
+				ProcessedMediaMetadata::from(parsed_embedded_metadata);
+			// Prioritize the additional over epub-rs since it is less comprehensive
+			embedded_metadata.merge(additional_metadata);
+		}
+
+		tracing::trace!(after = ?embedded_metadata, "Merged embedded metadata");

 		let file_path = std::path::Path::new(path).with_extension("opf");
 		if file_path.exists() {
 			let opf_string = std::fs::read_to_string(file_path)?;
 			let opf_metadata = parse_opf_xml(&opf_string)?;

-			// merge opf and embedded, prioritizing opf
+			// Prioritize the OPF metadata over the embedded metadata
 			let opf_metadata = ProcessedMediaMetadata::from(opf_metadata);
 			let mut combined_metadata = opf_metadata.clone();

@@ -469,6 +485,10 @@ fn parse_opf_xml(opf_content: &str) -> Result<HashMap<String, Vec<String>>, File
 						let property = String::from_utf8_lossy(&attr.value);
 						current_tag = property.to_string();
 					},
+					b"property" if tag_name == "opf:meta" => {
+						let property = String::from_utf8_lossy(&attr.value);
+						current_tag = property.to_string();
+					},
 					_ => {},
 				}
 			}
@@ -542,10 +562,47 @@ fn parse_opf_xml(opf_content: &str) -> Result<HashMap<String, Vec<String>>, File
 				if let Ok(text) = e.unescape() {
 					let content = text.trim().to_string();
 					if !content.is_empty() {
-						opf_metadata
-							.entry(current_tag.clone())
-							.or_default()
-							.push(content);
+						match current_tag.as_str() {
+							"belongs-to-collection" => {
+								opf_metadata
+									.entry("collection_name".to_string())
+									.or_default()
+									.push(content.clone());
+							},
+							"collection-type" => {
+								opf_metadata
+									.entry("collection_type".to_string())
+									.or_default()
+									.push(content.clone());
+							},
+							"group-position" => {
+								opf_metadata
+									.entry("collection_position".to_string())
+									.or_default()
+									.push(content.clone());
+							},
+							"identifier" => {
+								// Some books seem to have prefixed identifiers (e.g., "isbn:9780062444134")
+								if let Some(colon_pos) = content.find(':') {
+									let scheme = content[..colon_pos].to_lowercase();
+									let value = content[colon_pos + 1..].to_string();
+									let key = format!("identifier_{}", scheme);
+									opf_metadata.entry(key).or_default().push(value);
+								} else {
+									// No prefix, treat as generic identifier
+									opf_metadata
+										.entry(current_tag.clone())
+										.or_default()
+										.push(content);
+								}
+							},
+							_ => {
+								opf_metadata
+									.entry(current_tag.clone())
+									.or_default()
+									.push(content);
+							},
+						}
 					}
 				}
 			}
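The identifier arm above splits Calibre-style prefixed identifiers on the first colon, lowercases the scheme, and stores the value under an identifier_<scheme> key. A minimal standalone sketch of that keying rule (split_identifier is a hypothetical helper name, used here only for illustration):

// Hypothetical helper mirroring the "identifier" arm above: a prefixed value
// is split on the first colon, the scheme is lowercased, and the pair becomes
// an identifier_<scheme> key; unprefixed values keep the generic "identifier" key.
fn split_identifier(content: &str) -> (String, String) {
    match content.find(':') {
        Some(colon_pos) => {
            let scheme = content[..colon_pos].to_lowercase();
            let value = content[colon_pos + 1..].to_string();
            (format!("identifier_{}", scheme), value)
        },
        None => ("identifier".to_string(), content.to_string()),
    }
}

fn main() {
    assert_eq!(
        split_identifier("isbn:9780062444134"),
        ("identifier_isbn".to_string(), "9780062444134".to_string())
    );
    assert_eq!(
        split_identifier("mobi-asin:B00M0DRZ56"),
        ("identifier_mobi-asin".to_string(), "B00M0DRZ56".to_string())
    );
    assert_eq!(
        split_identifier("9780062444134"),
        ("identifier".to_string(), "9780062444134".to_string())
    );
}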
@@ -916,6 +973,92 @@ mod tests {
 		}
 	}

+	#[test]
+	fn test_parse_calibre_3_opf() {
+		let opf_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+			.join("integration-tests")
+			.join("data")
+			.join("calibre-2.opf");
+
+		let opf_content = std::fs::read_to_string(&opf_path)
+			.expect("Failed to read calibre-2.opf test file");
+
+		let metadata =
+			parse_opf_xml(&opf_content).expect("Failed to parse calibre-3.opf");
+
+		assert_eq!(
+			metadata.get("title"),
+			Some(&vec!["The Long Way to a Small, Angry Planet".to_string()])
+		);
+		assert_eq!(
+			metadata.get("creator"),
+			Some(&vec!["Becky Chambers".to_string()])
+		);
+		assert_eq!(
+			metadata.get("publisher"),
+			Some(&vec!["Harper Voyager".to_string()])
+		);
+		assert_eq!(metadata.get("language"), Some(&vec!["en".to_string()]));
+		assert_eq!(
+			metadata.get("date"),
+			Some(&vec!["2014-07-29T04:00:00+00:00".to_string()])
+		);
+
+		let subjects = metadata.get("subject").expect("Should have subjects");
+		assert_eq!(subjects.len(), 5);
+		assert!(subjects.contains(&"Science fiction".to_string()));
+		assert!(subjects.contains(&"Space Opera".to_string()));
+		assert!(subjects.contains(&"LGBT".to_string()));
+		assert!(subjects.contains(&"Fiction".to_string()));
+		assert!(subjects.contains(&"Queer".to_string()));
+
+		// Test the different format for series info
+		assert_eq!(
+			metadata.get("collection_name"),
+			Some(&vec!["Wayfarers".to_string()])
+		);
+		assert_eq!(
+			metadata.get("collection_type"),
+			Some(&vec!["series".to_string()])
+		);
+		assert_eq!(
+			metadata.get("collection_position"),
+			Some(&vec!["1".to_string()])
+		);
+
+		// Test prefixed identifiers
+		assert_eq!(
+			metadata.get("identifier_isbn"),
+			Some(&vec!["9780062444134".to_string()])
+		);
+		assert_eq!(
+			metadata.get("identifier_mobi-asin"),
+			Some(&vec!["B00M0DRZ56".to_string()])
+		);
+		assert_eq!(
+			metadata.get("identifier_calibre"),
+			Some(&vec!["42".to_string()])
+		);
+
+		let expected_keys = [
+			"title",
+			"creator",
+			"date",
+			"publisher",
+			"language",
+			"subject",
+			"identifier_isbn",
+			"identifier_mobi-asin",
+			"identifier_calibre",
+			"collection_name",
+			"collection_type",
+			"collection_position",
+		];
+		for key in expected_keys.iter() {
+			assert!(metadata.contains_key(*key), "Missing expected key: {}", key);
+		}
+	}
+
 	#[test]
 	fn test_get_page_content_types() {
 		let path = get_test_epub_path();

core/src/filesystem/media/metadata.rs

Lines changed: 7 additions & 3 deletions
@@ -218,16 +218,20 @@ impl From<HashMap<String, Vec<String>>> for ProcessedMediaMetadata {
 			match key.to_lowercase().as_str() {
 				"title" => metadata.title = Some(value.join("\n").to_string()),
 				"title_sort" => metadata.title_sort = Some(value.join("\n").to_string()),
-				"series" => metadata.series = Some(value.join("\n").to_string()),
-				"number" | "series_index" => {
+				"series" | "collection_name" => {
+					metadata.series = Some(value.join("\n").to_string())
+				},
+				"number" | "series_index" | "collection_position" => {
 					metadata.number =
 						value.into_iter().next().and_then(|n| n.parse().ok());
 				},
 				"volume" => {
 					metadata.volume =
 						value.into_iter().next().and_then(|n| n.parse().ok());
 				},
-				"summary" => metadata.summary = Some(value.join("\n").to_string()),
+				"summary" | "description" | "synopsis" => {
+					metadata.summary = Some(value.join("\n").to_string())
+				},
 				"notes" => metadata.notes = Some(value.join("\n").to_string()),
 				"genre" | "genres" | "subject" | "subjects" => {
 					metadata.genres = Some(value)
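With these aliases, the Calibre 3-style collection_* keys produced by parse_opf_xml land in the same series fields as the older keys. A minimal sketch of that mapping under the same rules, using a hypothetical SeriesInfo struct as a stand-in for the relevant ProcessedMediaMetadata fields (field types here are an assumption for illustration):

use std::collections::HashMap;

// Hypothetical stand-in for the series-related fields of ProcessedMediaMetadata.
#[derive(Debug, Default)]
struct SeriesInfo {
    series: Option<String>,
    number: Option<f64>,
}

// Mirrors the match arms above: "series"/"collection_name" feed the series name,
// "number"/"series_index"/"collection_position" feed the numeric position.
fn series_from_map(map: &HashMap<String, Vec<String>>) -> SeriesInfo {
    let mut info = SeriesInfo::default();
    for (key, value) in map {
        match key.to_lowercase().as_str() {
            "series" | "collection_name" => info.series = Some(value.join("\n")),
            "number" | "series_index" | "collection_position" => {
                info.number = value.iter().next().and_then(|n| n.parse().ok())
            },
            _ => {},
        }
    }
    info
}

fn main() {
    let map = HashMap::from([
        ("collection_name".to_string(), vec!["Wayfarers".to_string()]),
        ("collection_position".to_string(), vec!["1".to_string()]),
    ]);
    let info = series_from_map(&map);
    assert_eq!(info.series.as_deref(), Some("Wayfarers"));
    assert_eq!(info.number, Some(1.0));
}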

packages/browser/src/components/markdown/MarkdownPreview.tsx

Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,7 @@
 import { cn, cx, Divider, Heading, Text } from '@stump/components'
 import { forwardRef, PropsWithChildren, useState } from 'react'
 import ReactMarkdown from 'react-markdown'
+import rehypeRaw from 'rehype-raw'
 import remarkDirective from 'remark-directive'
 import remarkDirectiveRehype from 'remark-directive-rehype'
 import remarkGfm from 'remark-gfm'
@@ -15,6 +16,7 @@ export default function MarkdownPreview({ children, className }: Props) {
 	return (
 		<ReactMarkdown
 			remarkPlugins={[remarkDirective, remarkDirectiveRehype, remarkGfm]}
+			rehypePlugins={[rehypeRaw]}
 			className={cn('text-foreground-subtle', className)}
 			components={{
 				h1: ({ ref: _, ...props }) => (
