Skip to content

Commit ebf6961

Browse files
committed
Fix CR+LF being seen as a break between paragraphs
Closes retextjs/retext-repeated-words#10.
1 parent 72d9f0b commit ebf6961

File tree

5 files changed

+240
-9
lines changed

5 files changed

+240
-9
lines changed

lib/expressions.js

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/plugin/break-implicit-sentences.js

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,6 @@
11
import {toString} from 'nlcst-to-string'
22
import {modifyChildren} from 'unist-util-modify-children'
33

4-
// Two or more new line characters.
5-
import {newLineMulti} from '../expressions.js'
6-
74
// Break a sentence if a white space with more than one new-line is found.
85
export const breakImplicitSentences = modifyChildren(function (
96
child,
@@ -22,7 +19,10 @@ export const breakImplicitSentences = modifyChildren(function (
2219
while (++position < children.length - 1) {
2320
const node = children[position]
2421

25-
if (node.type !== 'WhiteSpaceNode' || !newLineMulti.test(toString(node))) {
22+
if (
23+
node.type !== 'WhiteSpaceNode' ||
24+
toString(node).split(/\r\n|\r|\n/).length < 3
25+
) {
2626
continue
2727
}
2828

package.json

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@
5050
},
5151
"scripts": {
5252
"prepack": "npm run generate && npm run format",
53+
"fixture": "node script/generate-fixture.js",
5354
"generate": "node script/build-expressions.js",
5455
"format": "remark . -qfo && prettier . -w --loglevel warn && xo --fix",
5556
"test-api": "node --conditions development test/index.js",

script/build-expressions.js

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -103,9 +103,6 @@ const reAffixSymbol = new RegExp(
103103
// Match one or more new line characters.
104104
const reNewLine = /^[ \t]*((\r?\n|\r)[\t ]*)+$/
105105

106-
// Match two or more new line characters.
107-
const reNewLineMulti = /^[ \t]*((\r?\n|\r)[\t ]*){2,}$/
108-
109106
// Match sentence-ending markers.
110107
const reTerminalMarker = new RegExp('^((?:' + terminalMarker + ')+)$')
111108

@@ -149,7 +146,6 @@ fs.writeFileSync(
149146
'// This module is generated by `script/build-expressions.js`.',
150147
'export const affixSymbol = ' + reAffixSymbol,
151148
'export const newLine = ' + reNewLine,
152-
'export const newLineMulti = ' + reNewLineMulti,
153149
'export const terminalMarker = ' + reTerminalMarker,
154150
'export const wordSymbolInner = ' + reWordSymbolInner,
155151
'export const numerical = ' + reNumerical,

test/index.js

Lines changed: 235 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -857,6 +857,241 @@ test('Ellipsis at sentence-end', async function (t) {
857857
)
858858
})
859859

860+
test('Line endings', function () {
861+
assert.deepEqual(
862+
loose(removePosition(latin.parse('alpha\rbravo'), true)),
863+
{
864+
type: 'RootNode',
865+
children: [
866+
{
867+
type: 'ParagraphNode',
868+
children: [
869+
{
870+
type: 'SentenceNode',
871+
children: [
872+
{
873+
type: 'WordNode',
874+
children: [{type: 'TextNode', value: 'alpha'}]
875+
},
876+
{type: 'WhiteSpaceNode', value: '\r'},
877+
{
878+
type: 'WordNode',
879+
children: [{type: 'TextNode', value: 'bravo'}]
880+
}
881+
]
882+
}
883+
]
884+
}
885+
]
886+
},
887+
'should support a CR line ending as whitespace'
888+
)
889+
890+
assert.deepEqual(
891+
loose(removePosition(latin.parse('alpha\nbravo'), true)),
892+
{
893+
type: 'RootNode',
894+
children: [
895+
{
896+
type: 'ParagraphNode',
897+
children: [
898+
{
899+
type: 'SentenceNode',
900+
children: [
901+
{
902+
type: 'WordNode',
903+
children: [{type: 'TextNode', value: 'alpha'}]
904+
},
905+
{type: 'WhiteSpaceNode', value: '\n'},
906+
{
907+
type: 'WordNode',
908+
children: [{type: 'TextNode', value: 'bravo'}]
909+
}
910+
]
911+
}
912+
]
913+
}
914+
]
915+
},
916+
'should support an LF line ending as whitespace'
917+
)
918+
919+
assert.deepEqual(
920+
loose(removePosition(latin.parse('alpha\r\nbravo'), true)),
921+
{
922+
type: 'RootNode',
923+
children: [
924+
{
925+
type: 'ParagraphNode',
926+
children: [
927+
{
928+
type: 'SentenceNode',
929+
children: [
930+
{
931+
type: 'WordNode',
932+
children: [{type: 'TextNode', value: 'alpha'}]
933+
},
934+
{type: 'WhiteSpaceNode', value: '\r\n'},
935+
{
936+
type: 'WordNode',
937+
children: [{type: 'TextNode', value: 'bravo'}]
938+
}
939+
]
940+
}
941+
]
942+
}
943+
]
944+
},
945+
'should support a CR+LF line ending as whitespace'
946+
)
947+
948+
assert.deepEqual(
949+
loose(removePosition(latin.parse('alpha \r\n\tbravo'), true)),
950+
{
951+
type: 'RootNode',
952+
children: [
953+
{
954+
type: 'ParagraphNode',
955+
children: [
956+
{
957+
type: 'SentenceNode',
958+
children: [
959+
{
960+
type: 'WordNode',
961+
children: [{type: 'TextNode', value: 'alpha'}]
962+
},
963+
{type: 'WhiteSpaceNode', value: ' \r\n\t'},
964+
{
965+
type: 'WordNode',
966+
children: [{type: 'TextNode', value: 'bravo'}]
967+
}
968+
]
969+
}
970+
]
971+
}
972+
]
973+
},
974+
'should support a padded CR+LF line ending as whitespace'
975+
)
976+
977+
assert.deepEqual(
978+
loose(removePosition(latin.parse('alpha\r \t\nbravo'), true)),
979+
{
980+
type: 'RootNode',
981+
children: [
982+
{
983+
type: 'ParagraphNode',
984+
children: [
985+
{
986+
type: 'SentenceNode',
987+
children: [
988+
{
989+
type: 'WordNode',
990+
children: [{type: 'TextNode', value: 'alpha'}]
991+
}
992+
]
993+
}
994+
]
995+
},
996+
{type: 'WhiteSpaceNode', value: '\r \t\n'},
997+
{
998+
type: 'ParagraphNode',
999+
children: [
1000+
{
1001+
type: 'SentenceNode',
1002+
children: [
1003+
{
1004+
type: 'WordNode',
1005+
children: [{type: 'TextNode', value: 'bravo'}]
1006+
}
1007+
]
1008+
}
1009+
]
1010+
}
1011+
]
1012+
},
1013+
'should support CR, whitespace, and then an LF, as a break between paragraphs'
1014+
)
1015+
1016+
assert.deepEqual(
1017+
loose(removePosition(latin.parse('alpha \r \t\rbravo'), true)),
1018+
{
1019+
type: 'RootNode',
1020+
children: [
1021+
{
1022+
type: 'ParagraphNode',
1023+
children: [
1024+
{
1025+
type: 'SentenceNode',
1026+
children: [
1027+
{
1028+
type: 'WordNode',
1029+
children: [{type: 'TextNode', value: 'alpha'}]
1030+
}
1031+
]
1032+
}
1033+
]
1034+
},
1035+
{type: 'WhiteSpaceNode', value: ' \r \t\r'},
1036+
{
1037+
type: 'ParagraphNode',
1038+
children: [
1039+
{
1040+
type: 'SentenceNode',
1041+
children: [
1042+
{
1043+
type: 'WordNode',
1044+
children: [{type: 'TextNode', value: 'bravo'}]
1045+
}
1046+
]
1047+
}
1048+
]
1049+
}
1050+
]
1051+
},
1052+
'should support two CRs with whitespace as a break between paragraphs'
1053+
)
1054+
1055+
assert.deepEqual(
1056+
loose(removePosition(latin.parse('alpha\r\rbravo'), true)),
1057+
{
1058+
type: 'RootNode',
1059+
children: [
1060+
{
1061+
type: 'ParagraphNode',
1062+
children: [
1063+
{
1064+
type: 'SentenceNode',
1065+
children: [
1066+
{
1067+
type: 'WordNode',
1068+
children: [{type: 'TextNode', value: 'alpha'}]
1069+
}
1070+
]
1071+
}
1072+
]
1073+
},
1074+
{type: 'WhiteSpaceNode', value: '\r\r'},
1075+
{
1076+
type: 'ParagraphNode',
1077+
children: [
1078+
{
1079+
type: 'SentenceNode',
1080+
children: [
1081+
{
1082+
type: 'WordNode',
1083+
children: [{type: 'TextNode', value: 'bravo'}]
1084+
}
1085+
]
1086+
}
1087+
]
1088+
}
1089+
]
1090+
},
1091+
'should support two CRs as a break between paragraphs'
1092+
)
1093+
})
1094+
8601095
test('Initial trailing white-space', async function (t) {
8611096
await t.test(
8621097
'should move trailing white-space up to the highest possible level',

0 commit comments

Comments
 (0)