Skip to content

Commit 1636be6

Browse files
svg+html: better handling for comments (#689)
After #193, HTML documents starting with a comment were no longer detected as HTML. That behaviour goes against the MIME sniffing standard (https://mimesniff.spec.whatwg.org/), but at the time it seemed the best solution. file(1) also detects comments as HTML. This commit makes SVG detection skip optional leading comments and makes HTML detection once again accept documents that start with a comment.
1 parent abda398 commit 1636be6

File tree

7 files changed

+63
-11
lines changed

7 files changed

+63
-11
lines changed

internal/charset/charset.go

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -209,15 +209,8 @@ func fromHTML(s scan.Bytes) string {
209209
meta := []byte("<META")
210210
lmeta := len(meta)
211211
for {
212-
if len(s) == 0 {
213-
return ""
214-
}
215-
if bytes.HasPrefix(s, []byte("<!--")) {
216-
// Offset by two (<!) because the starting and ending -- can be the same.j
217-
s.Advance(2)
218-
if i := bytes.Index(s, []byte("-->")); i != -1 {
219-
s.Advance(i)
220-
}
212+
if markup.SkipAComment(&s) {
213+
continue
221214
}
222215
if len(s) <= lmeta {
223216
return ""

internal/magic/text.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ var (
2929
[]byte("<BODY"),
3030
[]byte("<BR"),
3131
[]byte("<P"),
32+
[]byte("<!--"),
3233
)
3334
// XML matches an Extensible Markup Language file.
3435
XML = markup([]byte("<?XML"))
@@ -236,13 +237,16 @@ func Svg(raw []byte, limit uint32) bool {
236237
// svgWithoutXMLDeclaration matches a SVG image that does not have an XML header.
237238
// Example:
238239
//
240+
// <!-- xml comment ignored -->
239241
// <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
240242
// <rect fill="#fff" stroke="#000" x="-70" y="-70" width="390" height="390"/>
241243
// </svg>
242244
func svgWithoutXMLDeclaration(s scan.Bytes) bool {
243245
for scan.ByteIsWS(s.Peek()) {
244246
s.Advance(1)
245247
}
248+
for mkup.SkipAComment(&s) {
249+
}
246250
if !bytes.HasPrefix(s, []byte("<svg")) {
247251
return false
248252
}

internal/markup/markup.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
package markup
44

55
import (
6+
"bytes"
7+
68
"github.com/gabriel-vasile/mimetype/internal/scan"
79
)
810

@@ -88,3 +90,14 @@ func getAValue(s *scan.Bytes) (_ []byte, hasMore bool) {
8890
}
8991
}
9092
}
93+
94+
func SkipAComment(s *scan.Bytes) (skipped bool) {
95+
if bytes.HasPrefix(*s, []byte("<!--")) {
96+
// Offset by 2 len(<!) because the starting and ending -- can be the same.
97+
if i := bytes.Index((*s)[2:], []byte("-->")); i != -1 {
98+
s.Advance(i + 2 + 3) // 2 comes from len(<!) and 3 comes from len(-->).
99+
return true
100+
}
101+
}
102+
return false
103+
}

internal/markup/markup_test.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,3 +229,33 @@ func TestGetAllAttributes(t *testing.T) {
229229
})
230230
}
231231
}
232+
233+
func TestSkipAComment(t *testing.T) {
234+
tcases := []struct {
235+
in string
236+
out string
237+
skipped bool
238+
}{{
239+
"", "", false,
240+
}, {
241+
"abc", "abc", false,
242+
}, {
243+
"<!--", "<!--", false, // not ending comment
244+
}, {
245+
"<!-- abc -->", "", true, // regular comment
246+
}, {
247+
"<!-->", "", true, // the beginning and ending -- are the same chars
248+
}}
249+
for _, tc := range tcases {
250+
t.Run(tc.in, func(t *testing.T) {
251+
s := scan.Bytes(tc.in)
252+
skipped := SkipAComment(&s)
253+
if tc.skipped != skipped {
254+
t.Errorf("skipped got: %v, want: %v", skipped, tc.skipped)
255+
}
256+
if string(s) != tc.out {
257+
t.Errorf("got: %v, want: %v", string(s), tc.out)
258+
}
259+
})
260+
}
261+
}

mimetype_test.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,12 @@ a,"b`,
132132
"text/html; charset=iso-8859-1",
133133
none,
134134
},
135+
{
136+
"html with comment prefix",
137+
`<!-- this comment should not affect --><html><head>`,
138+
"text/html; charset=utf-8",
139+
none,
140+
},
135141
{"ico 01", "\x00\x00\x01\x00", "image/x-icon", one},
136142
{"ico 02", "\x00\x00\x02\x00", "image/x-icon", none},
137143
{"ics", "BEGIN:VCALENDAR\n00", "text/calendar", one},
@@ -246,6 +252,12 @@ a,"b`,
246252
"image/svg+xml",
247253
all,
248254
},
255+
{
256+
"svg with comment prefix",
257+
`<!-- this comment should not affect --><svg xmlns="http://www.w3.org/2000/svg"`,
258+
"image/svg+xml",
259+
none,
260+
},
249261

250262
{"swf", "CWS", "application/x-shockwave-flash", one},
251263
{"tar", fromDisk("tar.tar"), "application/x-tar", all},

supported_mimes.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,8 @@ Extension | MIME type | Aliases
147147
**.jxr** | image/jxr | image/vnd.ms-photo
148148
**.parquet** | application/vnd.apache.parquet | application/x-parquet
149149
**.txt** | text/plain | -
150-
**.html** | text/html | -
151150
**.svg** | image/svg+xml | -
151+
**.html** | text/html | -
152152
**.xml** | text/xml | application/xml
153153
**.rss** | application/rss+xml | text/rss
154154
**.atom** | application/atom+xml | -

tree.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ var (
8181
alias("application/x-ogg")
8282
oggAudio = newMIME("audio/ogg", ".oga", magic.OggAudio)
8383
oggVideo = newMIME("video/ogg", ".ogv", magic.OggVideo)
84-
text = newMIME("text/plain", ".txt", magic.Text, html, svg, xml, php, js, lua, perl, python, json, ndJSON, rtf, srt, tcl, csv, tsv, vCard, iCalendar, warc, vtt)
84+
text = newMIME("text/plain", ".txt", magic.Text, svg, html, xml, php, js, lua, perl, python, json, ndJSON, rtf, srt, tcl, csv, tsv, vCard, iCalendar, warc, vtt)
8585
xml = newMIME("text/xml", ".xml", magic.XML, rss, atom, x3d, kml, xliff, collada, gml, gpx, tcx, amf, threemf, xfdf, owl2).
8686
alias("application/xml")
8787
json = newMIME("application/json", ".json", magic.JSON, geoJSON, har, gltf)

0 commit comments

Comments
 (0)