Skip to content

Commit 4149fed

Browse files
TylerBarnes authored and gatsbybot committed
fix(gatsby-source-wordpress): HTML image regex's (#29778)
Co-authored-by: gatsbybot <[email protected]> (cherry picked from commit f6edccf)
1 parent a39ff9b commit 4149fed

File tree

3 files changed

+68
-31
lines changed

3 files changed

+68
-31
lines changed
Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,25 @@
1+
import {
2+
getImgSrcRemoteFileMatchesFromNodeString,
3+
getImgTagMatchesWithUrl,
4+
} from "../dist/steps/source-nodes/create-nodes/process-node"
5+
6+
test(`HTML image transformation regex matches images`, async () => {
7+
const wpUrl = `http://wp.fakesite.com`
8+
9+
const nodeString = `<img src=\\"https://wp.fakesite.com/wp-content/uploads/2020/01/©SDM-Yep-©Hi-000-Header.jpg />
10+
11+
<img src=\\"http://wp.fakesite.com/wp-content/uploads/2020/01/©SDM-Yep-©Hi-000-Header.jpg />
12+
13+
<img src=\\"/wp-content/uploads/2020/01/©SDM-Yep-©Hi-000-Header.jpg />`
14+
15+
const matches = getImgSrcRemoteFileMatchesFromNodeString(nodeString)
16+
17+
expect(matches.length).toBe(3)
18+
19+
const imgTagMatches = getImgTagMatchesWithUrl({
20+
nodeString,
21+
wpUrl,
22+
})
23+
24+
expect(imgTagMatches.length).toBe(3)
25+
})

packages/gatsby-source-wordpress/src/steps/source-nodes/create-nodes/process-node.js

Lines changed: 41 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
/* eslint-disable no-useless-escape */
12
import { isWebUri } from "valid-url"
23
import { fluid } from "gatsby-plugin-sharp"
34
import Img from "gatsby-image"
@@ -30,7 +31,7 @@ const getNodeEditLink = node => {
3031

3132
const findReferencedImageNodeIds = ({ nodeString, pluginOptions, node }) => {
3233
// if the lazyNodes plugin option is set we don't need to find
33-
// image node id's because those nodes will be fetched lazily in resolvers
34+
// image node id's because those nodes will be fetched lazily in resolvers.
3435
if (pluginOptions.type.MediaItem.lazyNodes) {
3536
return []
3637
}
@@ -327,6 +328,17 @@ const getCheerioElementFromMatch = wpUrl => ({ match, tag = `img` }) => {
327328
}
328329
}
329330

331+
const getCheerioElementsFromMatches = ({ imgTagMatches, wpUrl }) =>
332+
imgTagMatches
333+
.map(getCheerioElementFromMatch(wpUrl))
334+
.filter(({ cheerioImg: { attribs } }) => {
335+
if (!attribs.src) {
336+
return false
337+
}
338+
339+
return isWebUri(encodeURI(attribs.src))
340+
})
341+
330342
const getLargestSizeFromSizesAttribute = sizesString => {
331343
const sizesStringsArray = sizesString.split(`,`)
332344

@@ -444,6 +456,28 @@ const cacheCreatedFileNodeBySrc = ({ node, src }) => {
444456
}
445457
}
446458

459+
const imgSrcRemoteFileRegex = /(?:src=\\")((?:(?:https?|ftp|file):\/\/|www\.|ftp\.|\/)(?:[^'"])*\.(?:jpeg|jpg|png|gif|ico|mpg|ogv|svg|bmp|tif|tiff))(\?[^\\" \.]*|)(?=\\"| |\.)/gim
460+
461+
export const getImgSrcRemoteFileMatchesFromNodeString = nodeString =>
462+
execall(imgSrcRemoteFileRegex, nodeString).filter(({ subMatches }) => {
463+
// if our match is json encoded, that means it's inside a JSON
464+
// encoded string field.
465+
const isInJSON = subMatches[0].includes(`\\/\\/`)
466+
467+
// we shouldn't process encoded JSON, so skip this match if it's JSON
468+
return !isInJSON
469+
})
470+
471+
export const getImgTagMatchesWithUrl = ({ nodeString, wpUrl }) =>
472+
execall(
473+
/<img([\w\W]+?)[\/]?>/gim,
474+
nodeString
475+
// we don't want to match images inside pre
476+
.replace(/<pre([\w\W]+?)[\/]?>.*(<\/pre>)/gim, ``)
477+
// and code tags, so temporarily remove those tags and everything inside them
478+
.replace(/<code([\w\W]+?)[\/]?>.*(<\/code>)/gim, ``)
479+
).filter(filterMatches(wpUrl))
480+
447481
const replaceNodeHtmlImages = async ({
448482
nodeString,
449483
node,
@@ -456,38 +490,15 @@ const replaceNodeHtmlImages = async ({
456490
return nodeString
457491
}
458492

459-
const imgSrcRemoteFileRegex = /(?:src=\\")((?:(?:https?|ftp|file):\/\/|www\.|ftp\.|\/)(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#/%=~_|$?!:,.]*\)|[A-Z0-9+&@#/%=~_|$])\.(?:jpeg|jpg|png|gif|ico|mpg|ogv|svg|bmp|tif|tiff))(\?[^\\" .]*|)(?=\\"| |\.)/gim
493+
const imageUrlMatches = getImgSrcRemoteFileMatchesFromNodeString(nodeString)
460494

461-
const imageUrlMatches = execall(imgSrcRemoteFileRegex, nodeString).filter(
462-
({ subMatches }) => {
463-
// if our match is json encoded, that means it's inside a JSON
464-
// encoded string field.
465-
const isInJSON = subMatches[0].includes(`\\/\\/`)
466-
467-
// we shouldn't process encoded JSON, so skip this match if it's JSON
468-
return !isInJSON
469-
}
470-
)
471-
472-
const imgTagMatches = execall(
473-
/<img([\w\W]+?)[/]?>/gim,
474-
nodeString
475-
// we don't want to match images inside pre
476-
.replace(/<pre([\w\W]+?)[/]?>.*(<\/pre>)/gim, ``)
477-
// and code tags, so temporarily remove those tags and everything inside them
478-
.replace(/<code([\w\W]+?)[/]?>.*(<\/code>)/gim, ``)
479-
).filter(filterMatches(wpUrl))
495+
const imgTagMatches = getImgTagMatchesWithUrl({ nodeString, wpUrl })
480496

481497
if (imageUrlMatches.length && imgTagMatches.length) {
482-
const cheerioImages = imgTagMatches
483-
.map(getCheerioElementFromMatch(wpUrl))
484-
.filter(({ cheerioImg: { attribs } }) => {
485-
if (!attribs.src) {
486-
return false
487-
}
488-
489-
return isWebUri(attribs.src)
490-
})
498+
const cheerioImages = getCheerioElementsFromMatches({
499+
imgTagMatches,
500+
wpUrl,
501+
})
491502

492503
const htmlMatchesToMediaItemNodesMap = await fetchNodeHtmlImageMediaItemNodes(
493504
{

packages/gatsby-source-wordpress/src/steps/source-nodes/fetch-nodes/fetch-referenced-media-items.js

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -211,7 +211,8 @@ export const stripImageSizesFromUrl = url => {
211211
const fileExtension = urlToFileExtension(url)
212212

213213
const imageSizesPattern = new RegExp(
214-
`(?:[-_]([0-9]+)x([0-9]+))${fileExtension ? `.${fileExtension}` : ``}`
214+
// eslint-disable-next-line no-useless-escape
215+
`(?:[-_]([0-9]+)x([0-9]+))${fileExtension ? `\.${fileExtension}` : ``}`
215216
)
216217

217218
let urlWithoutSizes = url.replace(imageSizesPattern, ``)

0 commit comments

Comments
 (0)