Skip to content

Commit 113ba92

Browse files
fix: Handle char expansion in islower, isupper
Closes: #7526. `py_islower` and `py_isupper` need to handle expansions in case mappings: comparing chars one at a time can miss edge cases in certain languages, where the mapping of a single character may expand to several. Unfortunately, like the last PR, this allocates a buffer to hold the case-mapped text so it can be compared against the original. I also had to add `icu_casemap`, `icu_locale`, and `writeable` as dependencies. RustPython already uses parts of icu4x, so this doesn't add many transitive dependencies.
1 parent d5a90e5 commit 113ba92

File tree

5 files changed

+117
-34
lines changed

5 files changed

+117
-34
lines changed

Cargo.lock

Lines changed: 82 additions & 26 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,13 +222,16 @@ strum = "0.28"
222222
strum_macros = "0.28"
223223
syn = "2"
224224
thiserror = "2.0"
225+
icu_casemap = "2"
226+
icu_locale = "2"
225227
icu_properties = "2"
226228
icu_normalizer = "2"
227229
unicode-casing = "0.1.1"
228230
unic-ucd-age = "0.9.0"
229231
unicode_names2 = "2.0.0"
230232
widestring = "1.2.0"
231233
windows-sys = "0.61.2"
234+
writeable = "0.6.1"
232235
wasm-bindgen = "0.2.106"
233236

234237
# Lints

crates/vm/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,10 @@ timsort = "0.1.2"
8686
# TODO: use unic for this; needed for title case:
8787
# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939
8888
unicode-casing = { workspace = true }
89+
icu_casemap = { workspace = true }
90+
icu_locale = { workspace = true }
8991
icu_properties = { workspace = true }
92+
writeable = { workspace = true }
9093

9194
[target.'cfg(unix)'.dependencies]
9295
rustix = { workspace = true }

crates/vm/src/anystr.rs

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@ use crate::{
44
convert::TryFromBorrowedObject,
55
function::OptionalOption,
66
};
7+
use icu_casemap::CaseMapper;
8+
use icu_locale::LanguageIdentifier;
79
use num_traits::{cast::ToPrimitive, sign::Signed};
10+
use writeable::Writeable;
811

912
use core::ops::Range;
1013

@@ -408,12 +411,18 @@ pub trait AnyStr {
408411
// unicode_islower_impl
409412
fn py_islower(&self) -> bool {
410413
let mut lower = false;
411-
for c in self.elements() {
412-
if c.is_uppercase() {
414+
let mut lowercased = String::with_capacity(self.bytes_len());
415+
let cm = CaseMapper::new();
416+
for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
417+
let writer = cm.lowercase(chunk, &LanguageIdentifier::UNKNOWN);
418+
lowercased.clear();
419+
writer
420+
.write_to(&mut lowercased)
421+
.expect("Writing to a buffer is infallible");
422+
if chunk != lowercased {
413423
return false;
414-
} else if !lower && c.is_lowercase() {
415-
lower = true
416424
}
425+
lower = true;
417426
}
418427
lower
419428
}
@@ -423,12 +432,18 @@ pub trait AnyStr {
423432
// unicode_isupper_impl
424433
fn py_isupper(&self) -> bool {
425434
let mut upper = false;
426-
for c in self.elements() {
427-
if c.is_lowercase() {
435+
let mut uppercased = String::with_capacity(self.bytes_len());
436+
let cm = CaseMapper::new();
437+
for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
438+
let writer = cm.uppercase(chunk, &LanguageIdentifier::UNKNOWN);
439+
uppercased.clear();
440+
writer
441+
.write_to(&mut uppercased)
442+
.expect("Writing to a buffer is infallible");
443+
if chunk != uppercased {
428444
return false;
429-
} else if !upper && c.is_uppercase() {
430-
upper = true
431445
}
446+
upper = true;
432447
}
433448
upper
434449
}

extra_tests/snippets/builtin_str.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@
6969
assert not a.isdecimal()
7070
assert not a.isnumeric()
7171
assert a.istitle()
72+
assert "\u1c89".istitle()
73+
# assert "DZ".title() == "Dz"
7274
assert a.isalpha()
7375

7476
s = "1 2 3"
@@ -220,6 +222,8 @@
220222
assert "abc\t12345\txyz".expandtabs() == "abc 12345 xyz"
221223
assert "-".join(["1", "2", "3"]) == "1-2-3"
222224
assert "HALLO".isupper()
225+
assert "\u0295".islower()
226+
assert "\u1c89".isupper()
223227
assert "hello, my name is".partition("my ") == ("hello, ", "my ", "name is")
224228
assert "hello".partition("is") == ("hello", "", "")
225229
assert "hello, my name is".rpartition("is") == ("hello, my name ", "is", "")
@@ -236,6 +240,8 @@
236240
assert not "123".isidentifier()
237241

238242
assert "Σίσυφος".swapcase() == "σΊΣΥΦΟΣ"
243+
assert "\u0295".swapcase() == "\u0295"
244+
assert "\u1c89".swapcase() == "\u1c8a"
239245

240246
# String Formatting
241247
assert "{} {}".format(1, 2) == "1 2"

0 commit comments

Comments
 (0)