Skip to content

Commit ca110e6

Browse files
committed
Improve filtering
1 parent 711baee commit ca110e6

File tree

6 files changed

+139
-214
lines changed

6 files changed

+139
-214
lines changed

bin/test-regex-filtering.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
set -euxo pipefail
44

55
export RUST_BACKTRACE=1
6-
export CHRONO_TZ_TIMEZONE_FILTER='(Europe/London)'
6+
export CHRONO_TZ_TIMEZONE_FILTER='Europe/(London|Vaduz)'
77

88
cd chrono-tz/tests/check-regex-filtering
99

chrono-tz-build/src/lib.rs

Lines changed: 63 additions & 135 deletions
Original file line numberDiff line numberDiff line change
@@ -77,12 +77,12 @@ fn convert_bad_chars(name: &str) -> String {
7777
// The timezone file contains impls of `Timespans` for all timezones in the
7878
// database. The `Wrap` wrapper in the `timezone_impl` module then implements
7979
// TimeZone for any contained struct that implements `Timespans`.
80-
fn write_timezone_file(timezone_file: &mut File, table: &Table, uncased: bool) -> io::Result<()> {
81-
let zones = table
82-
.zonesets
83-
.keys()
84-
.chain(table.links.keys())
85-
.collect::<BTreeSet<_>>();
80+
fn write_timezone_file(
81+
timezone_file: &mut File,
82+
table: &Table,
83+
zones: &BTreeSet<&str>,
84+
uncased: bool,
85+
) -> io::Result<()> {
8686
writeln!(
8787
timezone_file,
8888
"use core::fmt::{{self, Debug, Display, Formatter}};",
@@ -106,14 +106,14 @@ fn write_timezone_file(timezone_file: &mut File, table: &Table, uncased: bool) -
106106
r#"#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]"#
107107
)?;
108108
writeln!(timezone_file, "pub enum Tz {{")?;
109-
for zone in &zones {
109+
for &zone in zones {
110110
let zone_name = convert_bad_chars(zone);
111111
writeln!(timezone_file, " /// {zone}\n {zone_name},")?;
112112
}
113113
writeln!(timezone_file, "}}")?;
114114

115115
let mut map = phf_codegen::Map::new();
116-
for zone in &zones {
116+
for &zone in zones {
117117
map.entry(zone, format!("Tz::{}", convert_bad_chars(zone)));
118118
}
119119
writeln!(
@@ -126,7 +126,7 @@ fn write_timezone_file(timezone_file: &mut File, table: &Table, uncased: bool) -
126126
if uncased {
127127
writeln!(timezone_file, "use uncased::UncasedStr;\n",)?;
128128
let mut map = phf_codegen::Map::new();
129-
for zone in &zones {
129+
for &zone in zones {
130130
map.entry(
131131
uncased::UncasedStr::new(zone),
132132
format!("Tz::{}", convert_bad_chars(zone)),
@@ -168,7 +168,7 @@ impl FromStr for Tz {{
168168
pub fn name(self) -> &'static str {{
169169
match self {{"
170170
)?;
171-
for zone in &zones {
171+
for &zone in zones {
172172
let zone_name = convert_bad_chars(zone);
173173
writeln!(timezone_file, " Tz::{zone_name} => \"{zone}\",")?;
174174
}
@@ -213,10 +213,11 @@ impl FromStr for Tz {{
213213
"impl TimeSpans for Tz {{
214214
fn timespans(&self) -> FixedTimespanSet {{"
215215
)?;
216-
for zone in &zones {
217-
if table.links.get(zone.as_str()).is_some() {
218-
continue;
219-
}
216+
for zone in zones
217+
.iter()
218+
.map(|&z| table.links.get(z).map(String::as_str).unwrap_or(z))
219+
.collect::<BTreeSet<_>>()
220+
{
220221
let zone_name = convert_bad_chars(zone);
221222
let timespans = table.timespans(zone).unwrap();
222223
writeln!(
@@ -240,9 +241,9 @@ impl FromStr for Tz {{
240241
"
241242
)?;
242243

243-
for zone in &zones {
244+
for &zone in zones {
244245
let zone_name = convert_bad_chars(zone);
245-
let target_name = if let Some(target) = table.links.get(zone.as_str()) {
246+
let target_name = if let Some(target) = table.links.get(zone) {
246247
convert_bad_chars(target)
247248
} else {
248249
zone_name.clone()
@@ -273,7 +274,7 @@ pub static TZ_VARIANTS: [Tz; {num}] = [
273274
",
274275
num = zones.len()
275276
)?;
276-
for zone in &zones {
277+
for &zone in zones {
277278
writeln!(
278279
timezone_file,
279280
" Tz::{zone},",
@@ -286,29 +287,30 @@ pub static TZ_VARIANTS: [Tz; {num}] = [
286287

287288
// Create a file containing nice-looking re-exports such as Europe::London
288289
// instead of having to use chrono_tz::timezones::Europe__London
289-
fn write_directory_file(directory_file: &mut File, table: &Table, version: &str) -> io::Result<()> {
290+
fn write_directory_file(
291+
directory_file: &mut File,
292+
table: &Table,
293+
zones: &BTreeSet<&str>,
294+
version: &str,
295+
) -> io::Result<()> {
296+
writeln!(directory_file, "use crate::timezones::Tz;\n")?;
297+
290298
// expose the underlying IANA TZDB version
291299
writeln!(
292300
directory_file,
293301
"pub const IANA_TZDB_VERSION: &str = \"{version}\";\n"
294302
)?;
303+
295304
// add the `loose' zone definitions first
296-
writeln!(directory_file, "use crate::timezones::Tz;\n")?;
297-
let zones = table
298-
.zonesets
299-
.keys()
300-
.chain(table.links.keys())
301-
.filter(|zone| !zone.contains('/'))
302-
.collect::<BTreeSet<_>>();
303-
for zone in zones {
305+
for &zone in zones.iter().filter(|zone| !zone.contains('/')) {
304306
let zone = convert_bad_chars(zone);
305307
writeln!(directory_file, "pub const {zone}: Tz = Tz::{zone};")?;
306308
}
307309
writeln!(directory_file)?;
308310

309311
// now add the `structured' zone names in submodules
310312
let mut first = true;
311-
for entry in table.structure() {
313+
for entry in parse_zoneinfo::structure::build_tree(zones.iter().copied()) {
312314
if entry.name.contains('/') {
313315
continue;
314316
}
@@ -320,7 +322,7 @@ fn write_directory_file(directory_file: &mut File, table: &Table, version: &str)
320322

321323
let module_name = convert_bad_chars(entry.name);
322324
writeln!(directory_file, "pub mod {module_name} {{")?;
323-
writeln!(directory_file, " use crate::timezones::Tz;\n",)?;
325+
writeln!(directory_file, " use super::*;\n",)?;
324326
for child in entry.children {
325327
let name = match child {
326328
Child::Submodule(name) => name,
@@ -365,112 +367,28 @@ fn write_directory_file(directory_file: &mut File, table: &Table, version: &str)
365367
Ok(())
366368
}
367369

368-
/// Module containing code supporting filter-by-regex feature
369-
///
370-
/// The "GMT" and "UTC" time zones are always included.
370+
/// Checks the `CHRONO_TZ_TIMEZONE_FILTER` environment variable.
371+
/// Converts it to a regex if set. Panics if the regex is not valid, as we want
372+
/// to fail the build if that happens.
371373
#[cfg(feature = "filter-by-regex")]
372-
mod filter {
373-
use std::collections::HashSet;
374-
use std::env;
375-
376-
use regex::Regex;
377-
378-
use crate::{Table, FILTER_ENV_VAR_NAME};
379-
380-
/// Filter `table` by applying [`FILTER_ENV_VAR_NAME`].
381-
pub(crate) fn maybe_filter_timezone_table(table: &mut Table) {
382-
if let Some(filter_regex) = get_filter_regex() {
383-
filter_timezone_table(table, filter_regex);
384-
}
385-
}
386-
387-
/// Checks the `CHRONO_TZ_TIMEZONE_FILTER` environment variable.
388-
/// Converts it to a regex if set. Panics if the regex is not valid, as we want
389-
/// to fail the build if that happens.
390-
fn get_filter_regex() -> Option<Regex> {
391-
match env::var(FILTER_ENV_VAR_NAME) {
392-
Ok(val) => {
393-
let val = val.trim();
394-
if val.is_empty() {
395-
return None;
396-
}
397-
match Regex::new(val) {
374+
fn get_filter_regex() -> Option<regex::Regex> {
375+
match std::env::var(FILTER_ENV_VAR_NAME) {
376+
Ok(val) => {
377+
let val = val.trim();
378+
if val.is_empty() {
379+
return None;
380+
}
381+
match regex::Regex::new(val) {
398382
Ok(regex) => Some(regex),
399383
Err(err) => panic!(
400384
"The value '{val:?}' for environment variable {FILTER_ENV_VAR_NAME} is not a valid regex, err={err}"
401385
),
402386
}
403-
}
404-
Err(env::VarError::NotPresent) => None,
405-
Err(env::VarError::NotUnicode(s)) => panic!(
406-
"The value '{s:?}' for environment variable {FILTER_ENV_VAR_NAME} is not valid Unicode"
407-
),
408-
}
409-
}
410-
411-
/// Insert a new name in the list of names to keep. If the name has 3
412-
/// parts, then also insert the 2-part prefix. If we don't do this we will lose
413-
/// half of Indiana in `directory.rs`. But we *don't* want to keep one-part names,
414-
/// otherwise we will inevitably end up with 'America' and include too much as
415-
/// a consequence.
416-
fn insert_keep_entry(keep: &mut HashSet<String>, new_value: &str) {
417-
let mut parts = new_value.split('/');
418-
if let (Some(p1), Some(p2), Some(_), None) =
419-
(parts.next(), parts.next(), parts.next(), parts.next())
420-
{
421-
keep.insert(format!("{p1}/{p2}"));
422-
}
423-
424-
keep.insert(new_value.to_string());
425-
}
426-
427-
/// Filter `table` by applying `filter_regex`.
428-
fn filter_timezone_table(table: &mut Table, filter_regex: Regex) {
429-
// Compute the transitive closure of things to keep.
430-
// Doing this, instead of just filtering `zonesets` and `links` by the
431-
// regex, helps to keep the `structure()` intact.
432-
let mut keep = HashSet::new();
433-
for (k, v) in &table.links {
434-
if filter_regex.is_match(k) || k == "GMT" || k == "UTC" {
435-
insert_keep_entry(&mut keep, k);
436-
}
437-
if filter_regex.is_match(v) || k == "GMT" || k == "UTC" {
438-
insert_keep_entry(&mut keep, v);
439-
}
440-
}
441-
442-
let mut n = 0;
443-
loop {
444-
let len = keep.len();
445-
446-
for (k, v) in &table.links {
447-
if keep.contains(k) && !keep.contains(v) {
448-
insert_keep_entry(&mut keep, v);
449-
}
450-
if keep.contains(v) && !keep.contains(k) {
451-
insert_keep_entry(&mut keep, k);
452-
}
453-
}
454-
455-
if keep.len() == len {
456-
break;
457-
}
458-
459-
n += 1;
460-
if n == 50 {
461-
println!("cargo:warning=Recursion limit reached while building filter list");
462-
break;
463-
}
464387
}
465-
466-
// Actually do the filtering.
467-
table
468-
.links
469-
.retain(|k, v| keep.contains(k) || keep.contains(v));
470-
471-
table
472-
.zonesets
473-
.retain(|k, _| filter_regex.is_match(k) || keep.iter().any(|s| k.starts_with(s)));
388+
Err(env::VarError::NotPresent) => None,
389+
Err(env::VarError::NotUnicode(s)) => panic!(
390+
"The value '{s:?}' for environment variable {FILTER_ENV_VAR_NAME} is not valid Unicode"
391+
),
474392
}
475393
}
476394

@@ -495,7 +413,7 @@ fn detect_iana_db_version() -> String {
495413
unreachable!("no version found")
496414
}
497415

498-
pub fn main(dir: &Path, _filter: bool, _uncased: bool) {
416+
pub fn main(dir: &Path, _filter: bool, uncased: bool) {
499417
let mut table = TableBuilder::new();
500418

501419
let root = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap_or_else(|_| String::new()));
@@ -509,19 +427,29 @@ pub fn main(dir: &Path, _filter: bool, _uncased: bool) {
509427
}
510428
}
511429

512-
#[allow(unused_mut)]
513-
let mut table = table.build();
430+
let table = table.build();
431+
514432
#[cfg(feature = "filter-by-regex")]
515-
if _filter {
516-
filter::maybe_filter_timezone_table(&mut table);
517-
}
433+
let regex = _filter.then(get_filter_regex).flatten();
434+
#[cfg(feature = "filter-by-regex")]
435+
let filter = |tz: &str| regex.as_ref().is_none_or(|r| r.is_match(tz));
436+
#[cfg(not(feature = "filter-by-regex"))]
437+
let filter = |_: &str| true;
438+
439+
let zones = table
440+
.zonesets
441+
.keys()
442+
.chain(table.links.keys())
443+
.filter(|s| filter(s))
444+
.map(String::as_str)
445+
.collect::<BTreeSet<_>>();
518446

519447
let timezone_path = dir.join("timezones.rs");
520448
let mut timezone_file = File::create(timezone_path).unwrap();
521-
write_timezone_file(&mut timezone_file, &table, _uncased).unwrap();
449+
write_timezone_file(&mut timezone_file, &table, &zones, uncased).unwrap();
522450

523451
let directory_path = dir.join("directory.rs");
524452
let mut directory_file = File::create(directory_path).unwrap();
525453
let version = detect_iana_db_version();
526-
write_directory_file(&mut directory_file, &table, &version).unwrap();
454+
write_directory_file(&mut directory_file, &table, &zones, &version).unwrap();
527455
}

0 commit comments

Comments
 (0)