Skip to content

Commit 4f4c5b1

Browse files
authored
ignore: simplify and reuse same implementation across check-ignore and is_ignored API (#10856)
1 parent 76cfe92 commit 4f4c5b1

File tree

6 files changed

+290
-79
lines changed

6 files changed

+290
-79
lines changed

dvc/commands/check_ignore.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,36 @@
1+
from typing import TYPE_CHECKING
2+
13
from dvc.cli import completion, formatter
24
from dvc.cli.command import CmdBase
35
from dvc.cli.utils import append_doc_link
46
from dvc.ui import ui
57

8+
if TYPE_CHECKING:
9+
from dvc.ignore import CheckIgnoreResult
10+
611

712
class CmdCheckIgnore(CmdBase):
813
def __init__(self, args):
914
super().__init__(args)
1015
self.ignore_filter = self.repo.dvcignore
1116

12-
def _show_results(self, result):
17+
def _show_results(self, result: "CheckIgnoreResult"):
1318
if not result.match and not self.args.details:
1419
return
1520

16-
if not result.patterns and not self.args.non_matching:
17-
return
18-
1921
if self.args.details:
20-
patterns = result.patterns or ["::"]
22+
pattern_infos = result.pattern_infos
23+
patterns = [str(pi) for pi in pattern_infos]
24+
if not patterns and self.args.non_matching:
25+
patterns = ["::"]
2126
if not self.args.all:
2227
patterns = patterns[-1:]
2328

2429
for pattern in patterns:
2530
ui.write(pattern, result.file, sep="\t")
2631
else:
2732
ui.write(result.file)
28-
return bool(result.patterns)
33+
return bool(result.pattern_infos)
2934

3035
def _check_one_file(self, target):
3136
result = self.ignore_filter.check_ignore(target)
@@ -77,6 +82,8 @@ def run(self):
7782

7883

7984
def add_parser(subparsers, parent_parser):
85+
import argparse
86+
8087
ADD_HELP = "Check whether files or directories are excluded due to `.dvcignore`."
8188

8289
parser = subparsers.add_parser(
@@ -94,14 +101,7 @@ def add_parser(subparsers, parent_parser):
94101
help="Show the exclude patterns along with each target path.",
95102
)
96103
parser.add_argument(
97-
"-a",
98-
"--all",
99-
action="store_true",
100-
default=False,
101-
help=(
102-
"Include the target paths which don't match any pattern "
103-
"in the `--details` list."
104-
),
104+
"-a", "--all", action="store_true", default=False, help=argparse.SUPPRESS
105105
)
106106
parser.add_argument(
107107
"-n",

dvc/ignore.py

Lines changed: 113 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import functools
12
import os
23
import re
34
from collections.abc import Iterable, Iterator
@@ -41,21 +42,58 @@ def __init__(
4142
]
4243

4344
self.sep = sep
44-
self.pattern_list: list[PatternInfo] = pattern_infos
45+
self.pattern_list: list[PatternInfo] = []
4546
self.dirname = dirname
47+
self.find_matching_pattern = functools.cache(self._find_matching_pattern)
4648

47-
self.regex_pattern_list: list[tuple[str, bool]] = []
48-
for count, pattern in enumerate(pattern_infos):
49-
regex, ignore = GitWildMatchPattern.pattern_to_regex(pattern.patterns)
49+
regex_pattern_list: list[tuple[str, bool, bool, PatternInfo]] = []
50+
for count, pattern_info in enumerate(pattern_infos):
51+
regex, ignore = GitWildMatchPattern.pattern_to_regex(pattern_info.patterns)
5052
if regex is not None and ignore is not None:
53+
self.pattern_list.append(pattern_info)
5154
regex = regex.replace(f"<{_DIR_MARK}>", f"<{_DIR_MARK}{count}>")
52-
self.regex_pattern_list.append((regex, ignore))
55+
regex_pattern_list.append(
56+
(regex, ignore, pattern_info.patterns.endswith("/"), pattern_info)
57+
)
58+
59+
def keyfunc(item: tuple[str, bool, bool, PatternInfo]) -> tuple[bool, bool]:
60+
_, ignore, dir_only_pattern, _ = item
61+
return ignore, dir_only_pattern
5362

54-
self.ignore_spec = [
55-
(ignore, re.compile("|".join(regex for regex, _ in group)))
56-
for ignore, group in groupby(self.regex_pattern_list, lambda x: x[1])
57-
if ignore is not None
63+
self.ignore_spec: list[
64+
tuple[
65+
re.Pattern[str],
66+
bool,
67+
bool,
68+
dict[Optional[str], tuple[str, PatternInfo]],
69+
]
5870
]
71+
self.ignore_spec = []
72+
for (ignore, dir_only_pattern), group in groupby(
73+
regex_pattern_list, key=keyfunc
74+
):
75+
if ignore:
76+
# For performance, we combine all exclude patterns.
77+
# But we still need to figure out which pattern matched which rule,
78+
# (eg: to show in `dvc check-ignore`).
79+
# So, we use named groups and keep a map of group name to pattern.
80+
pattern_map: dict[Optional[str], tuple[str, PatternInfo]] = {
81+
f"rule_{i}": (regex, pi)
82+
for i, (regex, _, _, pi) in enumerate(group)
83+
}
84+
combined_regex = "|".join(
85+
f"(?P<{name}>{regex})" for name, (regex, _) in pattern_map.items()
86+
)
87+
self.ignore_spec.append(
88+
(re.compile(combined_regex), ignore, dir_only_pattern, pattern_map)
89+
)
90+
else:
91+
# unignored patterns are not combined with `|`.
92+
for regex, _, _, pi in group:
93+
pattern_map = {None: (regex, pi)}
94+
self.ignore_spec.append(
95+
(re.compile(regex), ignore, dir_only_pattern, pattern_map)
96+
)
5997

6098
@classmethod
6199
def from_file(cls, path: str, fs: "FileSystem", name: str) -> "Self":
@@ -113,60 +151,78 @@ def matches(
113151
basename: str,
114152
is_dir: bool = False,
115153
details: Literal[True] = ...,
116-
) -> tuple[bool, list[str]]: ...
154+
) -> tuple[bool, list[PatternInfo]]: ...
117155

118156
@overload
119157
def matches(
120-
self, dirname: str, basename: str, is_dir: bool = False, details: bool = False
121-
) -> Union[bool, tuple[bool, list[str]]]: ...
158+
self,
159+
dirname: str,
160+
basename: str,
161+
is_dir: bool = False,
162+
details: bool = False,
163+
) -> Union[bool, tuple[bool, list[PatternInfo]]]: ...
122164

123165
def matches(
124-
self, dirname: str, basename: str, is_dir: bool = False, details: bool = False
125-
) -> Union[bool, tuple[bool, list[str]]]:
166+
self,
167+
dirname: str,
168+
basename: str,
169+
is_dir: bool = False,
170+
details: bool = False,
171+
) -> Union[bool, tuple[bool, list[PatternInfo]]]:
126172
path = self._get_normalize_path(dirname, basename)
127-
if not path:
128-
return (False, []) if details else False
129-
if details:
130-
return self._ignore_details(path, is_dir)
131-
return self.ignore(path, is_dir)
132-
133-
def ignore(self, path: str, is_dir: bool) -> bool:
134-
def matches(pattern, path, is_dir) -> bool:
135-
matches_ = bool(pattern.match(path))
136-
137-
if is_dir:
138-
matches_ |= bool(pattern.match(f"{path}/"))
139-
140-
return matches_
141-
142-
result = False
143-
144-
for ignore, pattern in self.ignore_spec[::-1]:
145-
if matches(pattern, path, is_dir):
146-
result = ignore
147-
break
148-
return result
149-
150-
def _ignore_details(self, path: str, is_dir: bool) -> tuple[bool, list[str]]:
151173
result = False
152-
matched_patterns = []
153-
for (regex, ignore), pattern_info in list(
154-
zip(self.regex_pattern_list, self.pattern_list)
174+
_match: list[PatternInfo] = []
175+
if path:
176+
result, _match = self._ignore(path, is_dir)
177+
return (result, _match) if details else result
178+
179+
def _find_matching_pattern(
180+
self, path: str, is_dir: bool
181+
) -> tuple[bool, list[PatternInfo]]:
182+
paths = [path]
183+
if is_dir and not path.endswith("/"):
184+
paths.append(f"{path}/")
185+
186+
for pattern, ignore, dir_only_pattern, pattern_map in reversed(
187+
self.ignore_spec
155188
):
156-
# skip system pattern
157-
if not pattern_info.file_info:
189+
if dir_only_pattern and not is_dir:
158190
continue
159-
160-
pattern = re.compile(regex)
161-
162-
matches = bool(pattern.match(path))
163-
if is_dir:
164-
matches |= bool(pattern.match(f"{path}/"))
165-
166-
if matches:
167-
matched_patterns.append(pattern_info.file_info)
168-
result = ignore
169-
return result, matched_patterns
191+
for p in paths:
192+
match = pattern.match(p)
193+
if not match:
194+
continue
195+
if ignore:
196+
group_name, _match = next(
197+
(
198+
(name, _match)
199+
for name, _match in match.groupdict().items()
200+
if name.startswith("rule_") and _match is not None
201+
)
202+
)
203+
else:
204+
# unignored patterns are not combined with `|`,
205+
# so there are no groups.
206+
group_name = None
207+
_regex, pattern_info = pattern_map[group_name]
208+
return ignore, [pattern_info]
209+
return False, []
210+
211+
def _ignore(self, path: str, is_dir: bool) -> tuple[bool, list[PatternInfo]]:
212+
parts = path.split("/")
213+
result = False
214+
matches: list[PatternInfo] = []
215+
for i in range(1, len(parts) + 1):
216+
rel_path = "/".join(parts[:i])
217+
result, _matches = self.find_matching_pattern(
218+
rel_path, is_dir or i < len(parts)
219+
)
220+
if i < len(parts) and not result:
221+
continue
222+
matches.extend(_matches)
223+
if result:
224+
break
225+
return result, matches
170226

171227
def __hash__(self) -> int:
172228
return hash(self.dirname + ":" + str(self.pattern_list))
@@ -186,7 +242,7 @@ def __bool__(self) -> bool:
186242
class CheckIgnoreResult(NamedTuple):
187243
file: str
188244
match: bool
189-
patterns: list[str]
245+
pattern_infos: list[PatternInfo]
190246

191247

192248
class DvcIgnoreFilter:
@@ -454,14 +510,14 @@ def check_ignore(self, target: str) -> CheckIgnoreResult:
454510
# NOTE: can only be used in `dvc check-ignore`, see
455511
# https://github.com/iterative/dvc/issues/5046
456512
full_target = self.fs.abspath(target)
457-
matched_patterns: list[str] = []
513+
matched_patterns: list[PatternInfo] = []
458514
ignore = False
459515
if not self._outside_repo(full_target):
460516
dirname, basename = self.fs.split(self.fs.normpath(full_target))
461517
pattern = self._get_trie_pattern(dirname)
462518
if pattern:
463519
ignore, matched_patterns = pattern.matches(
464-
dirname, basename, self.fs.isdir(full_target), True
520+
dirname, basename, self.fs.isdir(full_target), details=True
465521
)
466522
return CheckIgnoreResult(target, ignore, matched_patterns)
467523

dvc/output.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
from dvc_data.hashfile.obj import HashFile
4646
from dvc_data.index import DataIndexKey
4747

48-
from .ignore import DvcIgnoreFilter
48+
from .ignore import CheckIgnoreResult, DvcIgnoreFilter
4949

5050
logger = logger.getChild(__name__)
5151

@@ -267,9 +267,9 @@ def __init__(self, path):
267267

268268

269269
class OutputIsIgnoredError(DvcException):
270-
def __init__(self, match):
271-
lines = "\n".join(match.patterns)
272-
super().__init__(f"Path '{match.file}' is ignored by\n{lines}")
270+
def __init__(self, result: "CheckIgnoreResult"):
271+
pi = result.pattern_infos[-1]
272+
super().__init__(f"Path {result.file!r} is ignored by {pi!s}")
273273

274274

275275
class CheckoutCallback(TqdmCallback):
@@ -1203,8 +1203,9 @@ def _validate_output_path(self, path, stage=None):
12031203
if stage:
12041204
abs_path = os.path.join(stage.wdir, path)
12051205
if self._is_path_dvcignore(abs_path):
1206-
check = stage.repo.dvcignore.check_ignore(abs_path)
1207-
raise self.IsIgnoredError(check)
1206+
check: CheckIgnoreResult = stage.repo.dvcignore.check_ignore(abs_path)
1207+
if check.match:
1208+
raise self.IsIgnoredError(check)
12081209

12091210
def _check_can_merge(self, out):
12101211
if self.protocol != out.protocol:

dvc/pathspec_math.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ class PatternInfo(NamedTuple):
1313
patterns: str
1414
file_info: str
1515

16+
def __str__(self) -> str:
17+
return self.file_info or f":{self.patterns}"
18+
1619

1720
def _not_ignore(rule):
1821
return (True, rule[1:]) if rule.startswith("!") else (False, rule)

tests/func/test_check_ignore.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def test_check_ignore_dir(tmp_dir, dvc, path, ret):
7878

7979

8080
def test_check_ignore_default_dir(tmp_dir, dvc):
81-
assert main(["check-ignore", "-q", ".dvc"]) == 1
81+
assert main(["check-ignore", "-q", ".dvc"]) == 0
8282

8383

8484
def test_check_ignore_out_side_repo(tmp_dir, dvc):
@@ -125,7 +125,6 @@ def test_check_ignore_details_all(tmp_dir, dvc, capsys):
125125

126126
assert main(["check-ignore", "-d", "-a", "foo"]) == 0
127127
out, _ = capsys.readouterr()
128-
assert f"{DvcIgnore.DVCIGNORE_FILE}:1:f*\tfoo\n" in out
129128
assert f"{DvcIgnore.DVCIGNORE_FILE}:2:!foo\tfoo\n" in out
130129

131130

0 commit comments

Comments
 (0)