Skip to content

Commit 76408f0

Browse files
ammkrnfgdorais
andauthored
feat: add case insensitive comparison for strings (ASCII only) (#1283)
Co-authored-by: François G. Dorais <[email protected]>
1 parent 6e89c73 commit 76408f0

File tree

4 files changed

+147
-0
lines changed

4 files changed

+147
-0
lines changed

Batteries/Data/String.lean

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import Batteries.Data.String.AsciiCasing
12
import Batteries.Data.String.Basic
23
import Batteries.Data.String.Lemmas
34
import Batteries.Data.String.Matcher
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/-
2+
Copyright (c) 2025 Christopher Bailey. All rights reserved.
3+
Released under Apache 2.0 license as described in the file LICENSE.
4+
Authors: Christopher Bailey, F. G. Dorais
5+
-/
6+
import Batteries.Data.Char
7+
import Batteries.Data.Char.AsciiCasing
8+
9+
namespace String
10+
11+
/-- Case folding for ASCII characters only.
12+
13+
Alphabetic ASCII characters are mapped to their lowercase form, all other characters are left
14+
unchanged. This agrees with the Unicode case folding algorithm for ASCII characters.
15+
16+
```
17+
#eval "ABC".caseFoldAsciiOnly == "abc" -- true
18+
#eval "x".caseFoldAsciiOnly == "y" -- false
19+
#eval "Àà".caseFoldAsciiOnly == "Àà" -- true
20+
#eval "1$#!".caseFoldAsciiOnly == "1$#!" -- true
21+
```
22+
-/
23+
abbrev caseFoldAsciiOnly (s : String) := s.map Char.caseFoldAsciiOnly
24+
25+
-- TODO: Reimplement with finite iterators/streams when available for `String`.
26+
private partial def beqCaseInsensitiveAsciiOnlyImpl (s₁ s₂ : String) : Bool :=
27+
s₁.length == s₂.length && loop (ToStream.toStream s₁) (ToStream.toStream s₂)
28+
where
29+
loop i₁ i₂ := match Stream.next? i₁, Stream.next? i₂ with
30+
| some (c₁, i₁), some (c₂, i₂) => c₁.beqCaseInsensitiveAsciiOnly c₂ && loop i₁ i₂
31+
| none, none => true
32+
| _, _ => false
33+
34+
/--
35+
Bool-valued comparison of two `String`s for *ASCII*-case insensitive equality.
36+
37+
#eval "abc".beqCaseInsensitiveAsciiOnly "ABC" -- true
38+
#eval "cba".beqCaseInsensitiveAsciiOnly "ABC" -- false
39+
#eval "$".beqCaseInsensitiveAsciiOnly "$" -- true
40+
#eval "a".beqCaseInsensitiveAsciiOnly "b" -- false
41+
#eval "γ".beqCaseInsensitiveAsciiOnly "Γ" -- false
42+
-/
43+
@[implemented_by beqCaseInsensitiveAsciiOnlyImpl]
44+
def beqCaseInsensitiveAsciiOnly (s₁ s₂ : String) : Bool :=
45+
s₁.caseFoldAsciiOnly == s₂.caseFoldAsciiOnly
46+
47+
theorem beqCaseInsensitiveAsciiOnly.eqv : Equivalence (beqCaseInsensitiveAsciiOnly · ·) := {
48+
refl _ := BEq.rfl
49+
trans _ _ := by simp_all [beqCaseInsensitiveAsciiOnly]
50+
symm := by simp_all [beqCaseInsensitiveAsciiOnly]
51+
}
52+
53+
/--
54+
Setoid structure on `String` usig `beqCaseInsensitiveAsciiOnly`
55+
-/
56+
def beqCaseInsensitiveAsciiOnly.isSetoid : Setoid String :=
57+
⟨(beqCaseInsensitiveAsciiOnly · ·), beqCaseInsensitiveAsciiOnly.eqv⟩
58+
59+
-- TODO: Reimplement with finite iterators/streams when available for `String`.
60+
private partial def cmpCaseInsensitiveAsciiOnlyImpl (s₁ s₂ : String) : Ordering :=
61+
loop (ToStream.toStream s₁) (ToStream.toStream s₂)
62+
where
63+
loop i₁ i₂ := match Stream.next? i₁, Stream.next? i₂ with
64+
| some (c₁, i₁), some (c₂, i₂) => c₁.cmpCaseInsensitiveAsciiOnly c₂ |>.then (loop i₁ i₂)
65+
| some _, none => .gt
66+
| none, some _ => .lt
67+
| none, none => .eq
68+
69+
/--
70+
ASCII-case insensitive implementation comparison returning an `Ordering`. Useful for sorting.
71+
72+
```
73+
#eval cmpCaseInsensitiveAsciiOnly "a" "A" -- eq
74+
#eval cmpCaseInsensitiveAsciiOnly "a" "a" -- eq
75+
#eval cmpCaseInsensitiveAsciiOnly "$" "$" -- eq
76+
#eval cmpCaseInsensitiveAsciiOnly "a" "b" -- lt
77+
#eval cmpCaseInsensitiveAsciiOnly "γ" "Γ" -- gt
78+
```
79+
-/
80+
@[implemented_by cmpCaseInsensitiveAsciiOnlyImpl]
81+
def cmpCaseInsensitiveAsciiOnly (s₁ s₂ : String) : Ordering :=
82+
compare s₁.caseFoldAsciiOnly s₂.caseFoldAsciiOnly

Batteries/Data/String/Lemmas.lean

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -728,6 +728,26 @@ theorem takeWhileAux_of_valid (p : Char → Bool) : ∀ l m r,
728728
cases p c <;> simp
729729
simpa [← Nat.add_assoc, Nat.add_right_comm] using takeWhileAux_of_valid p (l++[c]) m r
730730

731+
@[simp]
732+
theorem data_eq_nil_iff (s : String) : s.data = [] ↔ s = "" :=
733+
fun h => ext (id h), congrArg data⟩
734+
735+
@[simp]
736+
theorem map_eq_empty_iff (s : String) (f : Char → Char) : (s.map f) = "" ↔ s = "" := by
737+
simp only [map_eq, ← data_eq_nil_iff, List.map_eq_nil_iff]
738+
739+
@[simp]
740+
theorem map_isEmpty_eq_isEmpty (s : String) (f : Char → Char) : (s.map f).isEmpty = s.isEmpty := by
741+
rw [Bool.eq_iff_iff]; simp [isEmpty_iff, map_eq_empty_iff]
742+
743+
@[simp]
744+
theorem length_map (s : String) (f : Char → Char) : (s.map f).length = s.length := by
745+
simp only [length, map_eq, List.length_map]
746+
747+
theorem length_eq_of_map_eq {a b : String} {f g : Char → Char} :
748+
a.map f = b.map g → a.length = b.length := by
749+
intro h; rw [← length_map a f, ← length_map b g, h]
750+
731751
end String
732752

733753
open String

BatteriesTest/String.lean

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import Batteries.Data.String.AsciiCasing
2+
3+
#guard "ABC".caseFoldAsciiOnly == "abc"
4+
#guard "x".caseFoldAsciiOnly != "y"
5+
#guard "Àà".caseFoldAsciiOnly == "Àà"
6+
#guard "1$#!".caseFoldAsciiOnly == "1$#!"
7+
8+
#guard "abc".beqCaseInsensitiveAsciiOnly "ABC" == true
9+
#guard "cba".beqCaseInsensitiveAsciiOnly "ABC" == false
10+
#guard "a".beqCaseInsensitiveAsciiOnly "a" == true
11+
#guard "$".beqCaseInsensitiveAsciiOnly "$" == true
12+
#guard "a".beqCaseInsensitiveAsciiOnly "b" == false
13+
#guard "γ".beqCaseInsensitiveAsciiOnly "Γ" == false
14+
#guard "ä".beqCaseInsensitiveAsciiOnly "Ä" == false
15+
16+
#guard "abc".cmpCaseInsensitiveAsciiOnly "ABC" == .eq
17+
#guard "abc".cmpCaseInsensitiveAsciiOnly "xyz" == .lt
18+
#guard "a".cmpCaseInsensitiveAsciiOnly "a" == .eq
19+
#guard "$".cmpCaseInsensitiveAsciiOnly "$" == .eq
20+
#guard "a__".cmpCaseInsensitiveAsciiOnly "b__" == .lt
21+
#guard "γ".cmpCaseInsensitiveAsciiOnly "Γ" == .gt
22+
#guard "ä".cmpCaseInsensitiveAsciiOnly "Ä" == .gt
23+
24+
#guard [
25+
("", ""),
26+
(" ", " "),
27+
("a", "A"),
28+
("abc", "ABC"),
29+
("aBc", "AbC"),
30+
("123", "123"),
31+
("国际化与本地化", "国际化与本地化"),
32+
("ABC😂🔰🍑xyz", "abc😂🔰🍑XYZ")
33+
].all (fun (a, b) => a.beqCaseInsensitiveAsciiOnly b && a.cmpCaseInsensitiveAsciiOnly b = .eq)
34+
35+
#guard [
36+
("国际化", "国际化与本地化"),
37+
("", " "),
38+
("a", "b"),
39+
("ab", "ba"),
40+
("123", "124"),
41+
("😂", "🍑"),
42+
("🔰🍑", "😂🔰🍑aaa")
43+
] |>.all fun (a, b) =>
44+
a ≠ b && !(a.beqCaseInsensitiveAsciiOnly b) && a.cmpCaseInsensitiveAsciiOnly b != .eq

0 commit comments

Comments
 (0)