@@ -91,15 +91,31 @@ impl FileProcessor for EpubProcessor {
91
91
}
92
92
93
93
fn process_metadata ( path : & str ) -> Result < Option < ProcessedMediaMetadata > , FileError > {
94
- let epub_file = Self :: open ( path) ?;
95
- let embedded_metadata = ProcessedMediaMetadata :: from ( epub_file. metadata ) ;
94
+ let mut epub_file = Self :: open ( path) ?;
95
+ let mut embedded_metadata =
96
+ ProcessedMediaMetadata :: from ( epub_file. metadata . clone ( ) ) ;
97
+
98
+ tracing:: trace!( before = ?embedded_metadata, "Processing embedded metadata" ) ;
99
+
100
+ let root_file_path = epub_file. root_file . clone ( ) ;
101
+ if let Some ( Ok ( parsed_embedded_metadata) ) = epub_file
102
+ . get_resource_str_by_path ( & root_file_path)
103
+ . map ( |xml| parse_opf_xml ( & xml) )
104
+ {
105
+ let additional_metadata =
106
+ ProcessedMediaMetadata :: from ( parsed_embedded_metadata) ;
107
+ // Prioritize the additional over epub-rs since it is less comprehensive
108
+ embedded_metadata. merge ( additional_metadata) ;
109
+ }
110
+
111
+ tracing:: trace!( after = ?embedded_metadata, "Merged embedded metadata" ) ;
96
112
97
113
let file_path = std:: path:: Path :: new ( path) . with_extension ( "opf" ) ;
98
114
if file_path. exists ( ) {
99
115
let opf_string = std:: fs:: read_to_string ( file_path) ?;
100
116
let opf_metadata = parse_opf_xml ( & opf_string) ?;
101
117
102
- // merge opf and embedded, prioritizing opf
118
+ // Prioritize the OPF metadata over the embedded metadata
103
119
let opf_metadata = ProcessedMediaMetadata :: from ( opf_metadata) ;
104
120
let mut combined_metadata = opf_metadata. clone ( ) ;
105
121
@@ -469,6 +485,10 @@ fn parse_opf_xml(opf_content: &str) -> Result<HashMap<String, Vec<String>>, File
469
485
let property = String :: from_utf8_lossy ( & attr. value ) ;
470
486
current_tag = property. to_string ( ) ;
471
487
} ,
488
+ b"property" if tag_name == "opf:meta" => {
489
+ let property = String :: from_utf8_lossy ( & attr. value ) ;
490
+ current_tag = property. to_string ( ) ;
491
+ } ,
472
492
_ => { } ,
473
493
}
474
494
}
@@ -542,10 +562,47 @@ fn parse_opf_xml(opf_content: &str) -> Result<HashMap<String, Vec<String>>, File
542
562
if let Ok ( text) = e. unescape ( ) {
543
563
let content = text. trim ( ) . to_string ( ) ;
544
564
if !content. is_empty ( ) {
545
- opf_metadata
546
- . entry ( current_tag. clone ( ) )
547
- . or_default ( )
548
- . push ( content) ;
565
+ match current_tag. as_str ( ) {
566
+ "belongs-to-collection" => {
567
+ opf_metadata
568
+ . entry ( "collection_name" . to_string ( ) )
569
+ . or_default ( )
570
+ . push ( content. clone ( ) ) ;
571
+ } ,
572
+ "collection-type" => {
573
+ opf_metadata
574
+ . entry ( "collection_type" . to_string ( ) )
575
+ . or_default ( )
576
+ . push ( content. clone ( ) ) ;
577
+ } ,
578
+ "group-position" => {
579
+ opf_metadata
580
+ . entry ( "collection_position" . to_string ( ) )
581
+ . or_default ( )
582
+ . push ( content. clone ( ) ) ;
583
+ } ,
584
+ "identifier" => {
585
+ // Some books seem to have prefixed identifiers (e.g., "isbn:9780062444134")
586
+ if let Some ( colon_pos) = content. find ( ':' ) {
587
+ let scheme = content[ ..colon_pos] . to_lowercase ( ) ;
588
+ let value = content[ colon_pos + 1 ..] . to_string ( ) ;
589
+ let key = format ! ( "identifier_{}" , scheme) ;
590
+ opf_metadata. entry ( key) . or_default ( ) . push ( value) ;
591
+ } else {
592
+ // No prefix, treat as generic identifier
593
+ opf_metadata
594
+ . entry ( current_tag. clone ( ) )
595
+ . or_default ( )
596
+ . push ( content) ;
597
+ }
598
+ } ,
599
+ _ => {
600
+ opf_metadata
601
+ . entry ( current_tag. clone ( ) )
602
+ . or_default ( )
603
+ . push ( content) ;
604
+ } ,
605
+ }
549
606
}
550
607
}
551
608
}
@@ -916,6 +973,92 @@ mod tests {
916
973
}
917
974
}
918
975
976
/// Parses the `calibre-2.opf` fixture and checks the keys produced by `parse_opf_xml`.
///
/// Covers three groups of output:
/// - plain fields (`title`, `creator`, `publisher`, `language`, `date`, `subject`),
/// - series information, which the parser stores under `collection_name`,
///   `collection_type` and `collection_position`,
/// - prefixed identifiers (e.g. `isbn:9780062444134`), which the parser splits into
///   `identifier_<scheme>` keys.
///
/// NOTE(review): the test name says "calibre_3" but the fixture loaded is
/// `calibre-2.opf`; the name is kept so existing test filters still match — confirm
/// which of the two is intended.
#[test]
fn test_parse_calibre_3_opf() {
    // Single source of truth for the fixture so failure messages cannot drift
    // away from the file that is actually read.
    const FIXTURE_NAME: &str = "calibre-2.opf";

    let opf_path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
        .join("integration-tests")
        .join("data")
        .join(FIXTURE_NAME);

    let opf_content = std::fs::read_to_string(&opf_path).unwrap_or_else(|err| {
        panic!(
            "Failed to read {} test file at {}: {}",
            FIXTURE_NAME,
            opf_path.display(),
            err
        )
    });

    let metadata = parse_opf_xml(&opf_content)
        .unwrap_or_else(|err| panic!("Failed to parse {}: {:?}", FIXTURE_NAME, err));

    assert_eq!(
        metadata.get("title"),
        Some(&vec!["The Long Way to a Small, Angry Planet".to_string()])
    );
    assert_eq!(
        metadata.get("creator"),
        Some(&vec!["Becky Chambers".to_string()])
    );
    assert_eq!(
        metadata.get("publisher"),
        Some(&vec!["Harper Voyager".to_string()])
    );
    assert_eq!(metadata.get("language"), Some(&vec!["en".to_string()]));
    assert_eq!(
        metadata.get("date"),
        Some(&vec!["2014-07-29T04:00:00+00:00".to_string()])
    );

    // Subjects are order-independent here: only the count and membership are checked.
    let subjects = metadata.get("subject").expect("Should have subjects");
    assert_eq!(subjects.len(), 5);
    assert!(subjects.contains(&"Science fiction".to_string()));
    assert!(subjects.contains(&"Space Opera".to_string()));
    assert!(subjects.contains(&"LGBT".to_string()));
    assert!(subjects.contains(&"Fiction".to_string()));
    assert!(subjects.contains(&"Queer".to_string()));

    // Test the different format for series info
    assert_eq!(
        metadata.get("collection_name"),
        Some(&vec!["Wayfarers".to_string()])
    );
    assert_eq!(
        metadata.get("collection_type"),
        Some(&vec!["series".to_string()])
    );
    assert_eq!(
        metadata.get("collection_position"),
        Some(&vec!["1".to_string()])
    );

    // Test prefixed identifiers
    assert_eq!(
        metadata.get("identifier_isbn"),
        Some(&vec!["9780062444134".to_string()])
    );
    assert_eq!(
        metadata.get("identifier_mobi-asin"),
        Some(&vec!["B00M0DRZ56".to_string()])
    );
    assert_eq!(
        metadata.get("identifier_calibre"),
        Some(&vec!["42".to_string()])
    );

    // Final sweep so a missing key is reported by name.
    let expected_keys = [
        "title",
        "creator",
        "date",
        "publisher",
        "language",
        "subject",
        "identifier_isbn",
        "identifier_mobi-asin",
        "identifier_calibre",
        "collection_name",
        "collection_type",
        "collection_position",
    ];
    for key in expected_keys.iter() {
        assert!(metadata.contains_key(*key), "Missing expected key: {}", key);
    }
}
1061
+
919
1062
#[ test]
920
1063
fn test_get_page_content_types ( ) {
921
1064
let path = get_test_epub_path ( ) ;
0 commit comments