1
+ import functools
1
2
import os
2
3
import re
3
4
from collections .abc import Iterable , Iterator
@@ -41,21 +42,58 @@ def __init__(
41
42
]
42
43
43
44
self .sep = sep
44
- self .pattern_list : list [PatternInfo ] = pattern_infos
45
+ self .pattern_list : list [PatternInfo ] = []
45
46
self .dirname = dirname
47
+ self .find_matching_pattern = functools .cache (self ._find_matching_pattern )
46
48
47
- self . regex_pattern_list : list [tuple [str , bool ]] = []
48
- for count , pattern in enumerate (pattern_infos ):
49
- regex , ignore = GitWildMatchPattern .pattern_to_regex (pattern .patterns )
49
+ regex_pattern_list : list [tuple [str , bool , bool , PatternInfo ]] = []
50
+ for count , pattern_info in enumerate (pattern_infos ):
51
+ regex , ignore = GitWildMatchPattern .pattern_to_regex (pattern_info .patterns )
50
52
if regex is not None and ignore is not None :
53
+ self .pattern_list .append (pattern_info )
51
54
regex = regex .replace (f"<{ _DIR_MARK } >" , f"<{ _DIR_MARK } { count } >" )
52
- self .regex_pattern_list .append ((regex , ignore ))
55
+ regex_pattern_list .append (
56
+ (regex , ignore , pattern_info .patterns .endswith ("/" ), pattern_info )
57
+ )
58
+
59
+ def keyfunc (item : tuple [str , bool , bool , PatternInfo ]) -> tuple [bool , bool ]:
60
+ _ , ignore , dir_only_pattern , _ = item
61
+ return ignore , dir_only_pattern
53
62
54
- self .ignore_spec = [
55
- (ignore , re .compile ("|" .join (regex for regex , _ in group )))
56
- for ignore , group in groupby (self .regex_pattern_list , lambda x : x [1 ])
57
- if ignore is not None
63
+ self .ignore_spec : list [
64
+ tuple [
65
+ re .Pattern [str ],
66
+ bool ,
67
+ bool ,
68
+ dict [Optional [str ], tuple [str , PatternInfo ]],
69
+ ]
58
70
]
71
+ self .ignore_spec = []
72
+ for (ignore , dir_only_pattern ), group in groupby (
73
+ regex_pattern_list , key = keyfunc
74
+ ):
75
+ if ignore :
76
+ # For performance, we combine all exclude patterns.
77
+ # But we still need to figure out which pattern matched which rule,
78
+ # (eg: to show in `dvc check-ignore`).
79
+ # So, we use named groups and keep a map of group name to pattern.
80
+ pattern_map : dict [Optional [str ], tuple [str , PatternInfo ]] = {
81
+ f"rule_{ i } " : (regex , pi )
82
+ for i , (regex , _ , _ , pi ) in enumerate (group )
83
+ }
84
+ combined_regex = "|" .join (
85
+ f"(?P<{ name } >{ regex } )" for name , (regex , _ ) in pattern_map .items ()
86
+ )
87
+ self .ignore_spec .append (
88
+ (re .compile (combined_regex ), ignore , dir_only_pattern , pattern_map )
89
+ )
90
+ else :
91
+ # unignored patterns are not combined with `|`.
92
+ for regex , _ , _ , pi in group :
93
+ pattern_map = {None : (regex , pi )}
94
+ self .ignore_spec .append (
95
+ (re .compile (regex ), ignore , dir_only_pattern , pattern_map )
96
+ )
59
97
60
98
@classmethod
61
99
def from_file (cls , path : str , fs : "FileSystem" , name : str ) -> "Self" :
@@ -113,60 +151,78 @@ def matches(
113
151
basename : str ,
114
152
is_dir : bool = False ,
115
153
details : Literal [True ] = ...,
116
- ) -> tuple [bool , list [str ]]: ...
154
+ ) -> tuple [bool , list [PatternInfo ]]: ...
117
155
118
156
@overload
def matches(
    self,
    dirname: str,
    basename: str,
    is_dir: bool = False,
    details: bool = False,
) -> "Union[bool, tuple[bool, list[PatternInfo]]]": ...


def matches(
    self,
    dirname: str,
    basename: str,
    is_dir: bool = False,
    details: bool = False,
) -> "Union[bool, tuple[bool, list[PatternInfo]]]":
    """Tell whether ``dirname``/``basename`` is ignored by these patterns.

    Args:
        dirname: Directory containing the entry.
        basename: Name of the entry inside ``dirname``.
        is_dir: Whether the entry itself is a directory.
        details: When true, also return the pattern infos that produced
            the verdict.

    Returns:
        The ignore verdict as a ``bool``; with ``details`` set, a
        ``(verdict, pattern_infos)`` tuple instead.
    """
    path = self._get_normalize_path(dirname, basename)
    if not path:
        # A falsy normalized path is treated as "nothing matched".
        return (False, []) if details else False

    result, matched = self._ignore(path, is_dir)
    return (result, matched) if details else result
178
+
179
def _find_matching_pattern(
    self, path: str, is_dir: bool
) -> "tuple[bool, list[PatternInfo]]":
    """Return the verdict of the last spec entry that matches ``path``.

    ``self.ignore_spec`` holds ``(compiled_regex, ignore, dir_only_pattern,
    pattern_map)`` tuples.  Entries are scanned from last to first and the
    first hit decides the result.

    Args:
        path: Path to test against the compiled patterns.
        is_dir: Whether ``path`` is a directory.  Directories are also
            tried with a trailing ``/`` appended.

    Returns:
        ``(ignore, [pattern_info])`` for the first entry that matches, or
        ``(False, [])`` when no entry matches.
    """
    candidates = [path]
    if is_dir and not path.endswith("/"):
        candidates.append(f"{path}/")

    for pattern, ignore, dir_only_pattern, pattern_map in reversed(
        self.ignore_spec
    ):
        # Entries flagged as directory-only are skipped for non-directories.
        if dir_only_pattern and not is_dir:
            continue
        for candidate in candidates:
            match = pattern.match(candidate)
            if not match:
                continue
            if ignore:
                # Ignore entries are several rules joined with `|`, each
                # wrapped in a `rule_<i>` named group; the group that
                # captured identifies the rule that matched.
                group_name = next(
                    name
                    for name, captured in match.groupdict().items()
                    if name.startswith("rule_") and captured is not None
                )
            else:
                # Un-ignore entries are not combined with `|`, so their
                # single rule is stored under the `None` key.
                group_name = None
            _, pattern_info = pattern_map[group_name]
            return ignore, [pattern_info]
    return False, []
210
+
211
def _ignore(self, path: str, is_dir: bool) -> "tuple[bool, list[PatternInfo]]":
    """Decide whether ``path`` is ignored, checking each parent first.

    The path is split on ``/`` and every leading prefix is looked up with
    ``self.find_matching_pattern``, shortest first.  An ignored ancestor
    ends the walk at once, so deeper components are never consulted.

    Args:
        path: ``/``-separated path to check.
        is_dir: Whether the full path is a directory.  Ancestor prefixes
            are always looked up as directories.

    Returns:
        ``(ignored, pattern_infos)``, where ``pattern_infos`` comes from
        the lookup that decided the verdict.
    """
    parts = path.split("/")
    depth_total = len(parts)
    result = False
    matched = []
    for depth in range(1, depth_total + 1):
        is_ancestor = depth < depth_total
        prefix = "/".join(parts[:depth])
        result, found = self.find_matching_pattern(prefix, is_dir or is_ancestor)
        if is_ancestor and not result:
            # An ancestor that is not ignored does not settle the verdict,
            # and its matches are not reported.
            continue
        matched.extend(found)
        if result:
            break
    return result, matched
170
226
171
227
def __hash__(self) -> int:
    """Hash the directory name joined by ``:`` to the pattern list's string form."""
    return hash(self.dirname + ":" + str(self.pattern_list))
@@ -186,7 +242,7 @@ def __bool__(self) -> bool:
186
242
class CheckIgnoreResult(NamedTuple):
    """Result of checking one target against the ignore rules."""

    # The target that was checked.
    file: str
    # Whether the target is ignored.
    match: bool
    # Pattern infos reported for the target.
    pattern_infos: "list[PatternInfo]"
190
246
191
247
192
248
class DvcIgnoreFilter :
@@ -454,14 +510,14 @@ def check_ignore(self, target: str) -> CheckIgnoreResult:
454
510
# NOTE: can only be used in `dvc check-ignore`, see
455
511
# https://github.com/iterative/dvc/issues/5046
456
512
full_target = self .fs .abspath (target )
457
- matched_patterns : list [str ] = []
513
+ matched_patterns : list [PatternInfo ] = []
458
514
ignore = False
459
515
if not self ._outside_repo (full_target ):
460
516
dirname , basename = self .fs .split (self .fs .normpath (full_target ))
461
517
pattern = self ._get_trie_pattern (dirname )
462
518
if pattern :
463
519
ignore , matched_patterns = pattern .matches (
464
- dirname , basename , self .fs .isdir (full_target ), True
520
+ dirname , basename , self .fs .isdir (full_target ), details = True
465
521
)
466
522
return CheckIgnoreResult (target , ignore , matched_patterns )
467
523
0 commit comments