@@ -15,17 +15,37 @@ function get_first_paragraph( string $html ): string {
15
15
return '' ;
16
16
}
17
17
18
- $ doc = new DOMDocument ();
18
+ $ cache_key = md5 ( $ html );
19
+ $ cached = wp_cache_get ( $ cache_key );
20
+
21
+ if ( $ cached ) {
22
+ return $ cached ;
23
+ }
24
+
25
+ $ doc = new DOMDocument ( '1.0 ' , 'utf-8 ' );
19
26
20
27
// phpcs:disable WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
21
28
$ doc ->formatOutput = false ;
22
29
$ doc ->substituteEntities = false ;
23
- // phpcs:enable WordPress.NamingConventions.ValidVariableName.UsedPropertyNotSnakeCase
30
+ $ doc ->preserveWhiteSpace = true ;
31
+ $ doc ->validateOnParse = false ;
32
+
33
+ // ensure string is utf8
34
+ $ encoded_content = mb_convert_encoding ( $ html , 'UTF-8 ' );
35
+ // encode everything
36
+ $ encoded_content = htmlentities ( $ encoded_content , ENT_NOQUOTES , 'UTF-8 ' );
37
+ // decode "standard" characters
38
+ $ encoded_content = htmlspecialchars_decode ( $ encoded_content , ENT_NOQUOTES );
39
+ // convert left side of ISO-8859-1 to HTML numeric character reference
40
+ // this was taken from PHP docs for mb_encode_numericentity vvvvvvvvvvvvvvvvvvvvvvvvv
41
+ $ encoded_content = mb_encode_numericentity ( $ encoded_content , [ 0x80 , 0x10FFFF , 0 , ~0 ], 'UTF-8 ' );
24
42
25
43
libxml_use_internal_errors ( true );
26
44
$ doc ->loadHTML (
27
- $ html ,
28
- LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD | LIBXML_NOXMLDECL | LIBXML_NOERROR | LIBXML_NOWARNING
45
+ '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body> ' .
46
+ $ encoded_content .
47
+ '</body> ' ,
48
+ LIBXML_HTML_NODEFDTD | LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NOENT
29
49
);
30
50
libxml_use_internal_errors ( false );
31
51
@@ -39,7 +59,10 @@ function get_first_paragraph( string $html ): string {
39
59
continue ;
40
60
}
41
61
42
- return wp_kses_post ( sprintf ( '<p>%s</p> ' , $ innards ) );
62
+ $ output = sprintf ( '<p>%s</p> ' , $ innards );
63
+
64
+ wp_cache_add ( $ cache_key , $ output );
65
+ return wp_kses_post ( $ output );
43
66
}
44
67
45
68
return '' ;
0 commit comments