Compare commits

...

8 Commits

Author SHA1 Message Date
Kim Morrison
ca5c04042c fix 2024-06-11 09:52:12 +10:00
Kim Morrison
b1c9666760 rename to utf8Size 2024-06-11 09:43:07 +10:00
Kim Morrison
e6c6ce64a7 fix doc-string 2024-06-11 09:43:07 +10:00
Kim Morrison
01a4280088 Add Char.size_eq 2024-06-11 09:43:07 +10:00
Kim Morrison
522bdcf6a3 leave utf8Size as a UInt32 2024-06-11 09:43:07 +10:00
Kim Morrison
ac1156cffa Apply suggestions from code review
Co-authored-by: Mac Malone <tydeu@hatpress.net>
2024-06-11 09:43:07 +10:00
Kim Morrison
a22a570d41 finish deprecations 2024-06-11 09:43:07 +10:00
Kim Morrison
9177e0d1c8 wip 2024-06-11 09:43:07 +10:00
4 changed files with 38 additions and 39 deletions

View File

@@ -22,13 +22,15 @@ protected theorem le_total (a b : Char) : a ≤ b ∨ b ≤ a := UInt32.le_total
protected theorem lt_asymm {a b : Char} (h : a < b) : ¬ b < a := UInt32.lt_asymm h
protected theorem ne_of_lt {a b : Char} (h : a < b) : a ≠ b := Char.ne_of_val_ne (UInt32.ne_of_lt h)
theorem utf8Size_pos (c : Char) : 0 < c.utf8Size := by
simp only [utf8Size]
repeat (split; decide)
decide
theorem utf8Size_eq (c : Char) : c.utf8Size = 1 ∨ c.utf8Size = 2 ∨ c.utf8Size = 3 ∨ c.utf8Size = 4 := by
have := c.utf8Size_pos
have := c.utf8Size_le_four
omega
@[simp] theorem ofNat_toNat (c : Char) : Char.ofNat c.toNat = c := by
rw [Char.ofNat, dif_pos]
rfl
end Char
@[deprecated Char.utf8Size (since := "2024-06-04")] abbrev String.csize := Char.utf8Size

View File

@@ -305,15 +305,20 @@ def next' (s : @& String) (p : @& Pos) (h : ¬ s.atEnd p) : Pos :=
let c := get s p
p + c
theorem one_le_csize (c : Char) : 1 ≤ csize c := by
repeat first | apply iteInduction (motive := (1 ≤ UInt32.toNat ·)) <;> intros | decide
theorem _root_.Char.utf8Size_pos (c : Char) : 0 < c.utf8Size := by
repeat first | apply iteInduction (motive := (0 < ·)) <;> intros | decide
theorem _root_.Char.utf8Size_le_four (c : Char) : c.utf8Size ≤ 4 := by
repeat first | apply iteInduction (motive := (· ≤ 4)) <;> intros | decide
@[deprecated Char.utf8Size_pos (since := "2024-06-04")] abbrev one_le_csize := Char.utf8Size_pos
@[simp] theorem pos_lt_eq (p₁ p₂ : Pos) : (p₁ < p₂) = (p₁.1 < p₂.1) := rfl
@[simp] theorem pos_add_char (p : Pos) (c : Char) : (p + c).byteIdx = p.byteIdx + csize c := rfl
@[simp] theorem pos_add_char (p : Pos) (c : Char) : (p + c).byteIdx = p.byteIdx + c.utf8Size := rfl
theorem lt_next (s : String) (i : Pos) : i.1 < (s.next i).1 :=
Nat.add_lt_add_left (one_le_csize _) _
Nat.add_lt_add_left (Char.utf8Size_pos _) _
theorem utf8PrevAux_lt_of_pos : ∀ (cs : List Char) (i p : Pos), p ≠ 0 →
(utf8PrevAux cs i p).1 < p.1
@@ -323,7 +328,7 @@ theorem utf8PrevAux_lt_of_pos : ∀ (cs : List Char) (i p : Pos), p ≠ 0 →
| c::cs, i, p, h => by
simp [utf8PrevAux]
apply iteInduction (motive := (Pos.byteIdx · < _)) <;> intro h'
next => exact h' ▸ Nat.add_lt_add_left (one_le_csize _) _
next => exact h' ▸ Nat.add_lt_add_left (Char.utf8Size_pos _) _
next => exact utf8PrevAux_lt_of_pos _ _ _ h
theorem prev_lt_of_pos (s : String) (i : Pos) (h : i ≠ 0) : (s.prev i).1 < i.1 := by
@@ -476,7 +481,7 @@ decreasing_by
focus
rename_i i₀ j₀ _ eq h'
rw [show (s.next i₀ - sep.next j₀).1 = (i₀ - j₀).1 by
show (_ + csize _) - (_ + csize _) = _
show (_ + Char.utf8Size _) - (_ + Char.utf8Size _) = _
rw [(beq_iff_eq ..).1 eq, Nat.add_sub_add_right]; rfl]
right; exact Nat.sub_lt_sub_left
(Nat.lt_of_le_of_lt (Nat.le_add_right ..) (Nat.gt_of_not_le (mt decide_eq_true h')))
@@ -724,18 +729,18 @@ theorem set_next_add (s : String) (i : Pos) (c : Char) (b₁ b₂)
simp [next, get, set, endPos, utf8ByteSize] at h ⊢
rw [Nat.add_comm i.1, Nat.add_assoc] at h ⊢
let rec foo : ∀ cs a b₁ b₂,
csize (utf8GetAux cs a i) + b₁ = utf8ByteSize.go cs + b₂ →
csize (utf8GetAux (utf8SetAux c cs a i) a i) + b₁ = utf8ByteSize.go (utf8SetAux c cs a i) + b₂
(utf8GetAux cs a i).utf8Size + b₁ = utf8ByteSize.go cs + b₂ →
(utf8GetAux (utf8SetAux c cs a i) a i).utf8Size + b₁ = utf8ByteSize.go (utf8SetAux c cs a i) + b₂
| [], _, _, _, h => h
| c'::cs, a, b₁, b₂, h => by
unfold utf8SetAux
apply iteInduction (motive := fun p => csize (utf8GetAux p a i) + b₁ = utf8ByteSize.go p + b₂) <;>
apply iteInduction (motive := fun p => (utf8GetAux p a i).utf8Size + b₁ = utf8ByteSize.go p + b₂) <;>
intro h' <;> simp [utf8GetAux, h', utf8ByteSize.go] at h ⊢
next =>
rw [Nat.add_assoc, Nat.add_left_comm] at h ⊢; rw [Nat.add_left_cancel h]
next =>
rw [Nat.add_assoc] at h ⊢
refine foo cs (a + c') b₁ (csize c' + b₂) h
refine foo cs (a + c') b₁ (c'.utf8Size + b₂) h
exact foo s.1 0 _ _ h
theorem mapAux_lemma (s : String) (i : Pos) (c : Char) (h : ¬s.atEnd i) :
@@ -788,7 +793,7 @@ where
else true
termination_by stop1.1 - off1.1
decreasing_by
have := Nat.sub_lt_sub_left _h (Nat.add_lt_add_left (one_le_csize c₁) off1.1)
have := Nat.sub_lt_sub_left _h (Nat.add_lt_add_left c₁.utf8Size_pos off1.1)
decreasing_tactic
/-- Return true iff `p` is a prefix of `s` -/
@@ -1136,14 +1141,14 @@ theorem add_eq (p₁ p₂ : Pos) : p₁ + p₂ = ⟨p₁.byteIdx + p₂.byteIdx
theorem sub_eq (p₁ p₂ : Pos) : p₁ - p₂ = ⟨p₁.byteIdx - p₂.byteIdx⟩ := rfl
@[simp] theorem addChar_byteIdx (p : Pos) (c : Char) : (p + c).byteIdx = p.byteIdx + csize c := rfl
@[simp] theorem addChar_byteIdx (p : Pos) (c : Char) : (p + c).byteIdx = p.byteIdx + c.utf8Size := rfl
theorem addChar_eq (p : Pos) (c : Char) : p + c = ⟨p.byteIdx + csize c⟩ := rfl
theorem addChar_eq (p : Pos) (c : Char) : p + c = ⟨p.byteIdx + c.utf8Size⟩ := rfl
theorem zero_addChar_byteIdx (c : Char) : ((0 : Pos) + c).byteIdx = csize c := by
theorem zero_addChar_byteIdx (c : Char) : ((0 : Pos) + c).byteIdx = c.utf8Size := by
simp only [addChar_byteIdx, byteIdx_zero, Nat.zero_add]
theorem zero_addChar_eq (c : Char) : (0 : Pos) + c = ⟨csize c⟩ := by rw [← zero_addChar_byteIdx]
theorem zero_addChar_eq (c : Char) : (0 : Pos) + c = ⟨c.utf8Size⟩ := by rw [← zero_addChar_byteIdx]
theorem addChar_right_comm (p : Pos) (c₁ c₂ : Char) : p + c₁ + c₂ = p + c₂ + c₁ := by
apply ext

View File

@@ -63,10 +63,10 @@ where
loop (i : Nat) : Option Unit := do
if i < a.size then
let c ← utf8DecodeChar? a i
loop (i + csize c)
loop (i + c.utf8Size)
else pure ()
termination_by a.size - i
decreasing_by exact Nat.sub_lt_sub_left ‹_› (Nat.lt_add_of_pos_right (one_le_csize c))
decreasing_by exact Nat.sub_lt_sub_left ‹_› (Nat.lt_add_of_pos_right c.utf8Size_pos)
/-- Converts a [UTF-8](https://en.wikipedia.org/wiki/UTF-8) encoded `ByteArray` string to `String`. -/
@[extern "lean_string_from_utf8"]
@@ -76,10 +76,10 @@ where
loop (i : Nat) (acc : String) : String :=
if i < a.size then
let c := (utf8DecodeChar? a i).getD default
loop (i + csize c) (acc.push c)
loop (i + c.utf8Size) (acc.push c)
else acc
termination_by a.size - i
decreasing_by exact Nat.sub_lt_sub_left ‹_› (Nat.lt_add_of_pos_right (one_le_csize c))
decreasing_by exact Nat.sub_lt_sub_left ‹_› (Nat.lt_add_of_pos_right c.utf8Size_pos)
/-- Converts a [UTF-8](https://en.wikipedia.org/wiki/UTF-8) encoded `ByteArray` string to `String`,
or returns `none` if `a` is not properly UTF-8 encoded. -/
@@ -108,8 +108,8 @@ def utf8EncodeChar (c : Char) : List UInt8 :=
(v >>> 6).toUInt8 &&& 0x3f ||| 0x80,
v.toUInt8 &&& 0x3f ||| 0x80]
@[simp] theorem length_utf8EncodeChar (c : Char) : (utf8EncodeChar c).length = csize c := by
simp [csize, utf8EncodeChar, Char.utf8Size]
@[simp] theorem length_utf8EncodeChar (c : Char) : (utf8EncodeChar c).length = c.utf8Size := by
simp [Char.utf8Size, utf8EncodeChar]
cases Decidable.em (c.val ≤ 0x7f) <;> simp [*]
cases Decidable.em (c.val ≤ 0x7ff) <;> simp [*]
cases Decidable.em (c.val ≤ 0xffff) <;> simp [*]

View File

@@ -2196,15 +2196,11 @@ instance : DecidableEq Char :=
| isFalse h => isFalse (Char.ne_of_val_ne h)
/-- Returns the number of bytes required to encode this `Char` in UTF-8. -/
def Char.utf8Size (c : Char) : UInt32 :=
def Char.utf8Size (c : Char) : Nat :=
let v := c.val
ite (LE.le v (UInt32.ofNatCore 0x7F (by decide)))
(UInt32.ofNatCore 1 (by decide))
(ite (LE.le v (UInt32.ofNatCore 0x7FF (by decide)))
(UInt32.ofNatCore 2 (by decide))
(ite (LE.le v (UInt32.ofNatCore 0xFFFF (by decide)))
(UInt32.ofNatCore 3 (by decide))
(UInt32.ofNatCore 4 (by decide))))
ite (LE.le v (UInt32.ofNatCore 0x7F (by decide))) 1
(ite (LE.le v (UInt32.ofNatCore 0x7FF (by decide))) 2
(ite (LE.le v (UInt32.ofNatCore 0xFFFF (by decide))) 3 4))
/--
`Option α` is the type of values which are either `some a` for some `a : α`,
@@ -2433,10 +2429,6 @@ instance : Inhabited Substring where
@[inline] def Substring.bsize : Substring → Nat
| ⟨_, b, e⟩ => e.byteIdx.sub b.byteIdx
/-- Returns the number of bytes required to encode this `Char` in UTF-8. -/
def String.csize (c : Char) : Nat :=
c.utf8Size.toNat
/--
The UTF-8 byte length of this string.
This is overridden by the compiler to be cached and O(1).
@@ -2447,7 +2439,7 @@ def String.utf8ByteSize : (@& String) → Nat
where
go : List Char → Nat
| .nil => 0
| .cons c cs => hAdd (go cs) (csize c)
| .cons c cs => hAdd (go cs) c.utf8Size
instance : HAdd String.Pos String.Pos String.Pos where
hAdd p₁ p₂ := { byteIdx := hAdd p₁.byteIdx p₂.byteIdx }
@@ -2456,7 +2448,7 @@ instance : HSub String.Pos String.Pos String.Pos where
hSub p₁ p₂ := { byteIdx := HSub.hSub p₁.byteIdx p₂.byteIdx }
instance : HAdd String.Pos Char String.Pos where
hAdd p c := { byteIdx := hAdd p.byteIdx (String.csize c) }
hAdd p c := { byteIdx := hAdd p.byteIdx c.utf8Size }
instance : HAdd String.Pos String String.Pos where
hAdd p s := { byteIdx := hAdd p.byteIdx s.utf8ByteSize }