Skip to content

Commit e52405f

Browse files
Ensure islower/isupper handle strs without cased chars
This fixes a regression mentioned by CodeRabbit. I also figured out how to check a string's case without allocation using Unicode properties. Thus, this commit removes `icu_casemap` again. `icu_casemap` and my old solution are required for a robust case check, but it seems like the current code is fine for Python.
1 parent 44ef516 commit e52405f

5 files changed

Lines changed: 24 additions & 82 deletions

File tree

Cargo.lock

Lines changed: 0 additions & 55 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -222,16 +222,13 @@ strum = "0.28"
222222
strum_macros = "0.28"
223223
syn = "2"
224224
thiserror = "2.0"
225-
icu_casemap = "2"
226-
icu_locale = "2"
227225
icu_properties = "2"
228226
icu_normalizer = "2"
229227
unicode-casing = "0.1.1"
230228
unic-ucd-age = "0.9.0"
231229
unicode_names2 = "2.0.0"
232230
widestring = "1.2.0"
233231
windows-sys = "0.61.2"
234-
writeable = "0.6.1"
235232
wasm-bindgen = "0.2.106"
236233

237234
# Lints

crates/vm/Cargo.toml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,7 @@ timsort = "0.1.2"
8686
# TODO: use unic for this; needed for title case:
8787
# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939
8888
unicode-casing = { workspace = true }
89-
icu_casemap = { workspace = true }
90-
icu_locale = { workspace = true }
9189
icu_properties = { workspace = true }
92-
writeable = { workspace = true }
9390

9491
[target.'cfg(unix)'.dependencies]
9592
rustix = { workspace = true }

crates/vm/src/anystr.rs

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@ use crate::{
44
convert::TryFromBorrowedObject,
55
function::OptionalOption,
66
};
7-
use icu_casemap::CaseMapper;
8-
use icu_locale::LanguageIdentifier;
7+
use icu_properties::{
8+
CodePointSetData,
9+
props::{Alphabetic, ChangesWhenLowercased, ChangesWhenUppercased},
10+
};
911
use num_traits::{cast::ToPrimitive, sign::Signed};
10-
use writeable::Writeable;
1112

1213
use core::ops::Range;
1314

@@ -410,19 +411,17 @@ pub trait AnyStr {
410411
// _Py_bytes_islower
411412
// unicode_islower_impl
412413
fn py_islower(&self) -> bool {
414+
let case_change = CodePointSetData::new::<ChangesWhenLowercased>();
415+
let alphabetic = CodePointSetData::new::<Alphabetic>();
413416
let mut lower = false;
414-
let mut lowercased = String::with_capacity(self.bytes_len());
415-
let cm = CaseMapper::new();
416417
for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
417-
let writer = cm.lowercase(chunk, &LanguageIdentifier::UNKNOWN);
418-
lowercased.clear();
419-
writer
420-
.write_to(&mut lowercased)
421-
.expect("Writing to a buffer is infallible");
422-
if chunk != lowercased {
418+
if chunk.chars().any(|c| case_change.contains(c)) {
423419
return false;
424420
}
425-
lower = true;
421+
422+
if !lower && chunk.chars().any(|c| alphabetic.contains(c)) {
423+
lower = true;
424+
}
426425
}
427426
lower
428427
}
@@ -431,19 +430,17 @@ pub trait AnyStr {
431430
// Py_bytes_isupper
432431
// unicode_isupper_impl
433432
fn py_isupper(&self) -> bool {
433+
let case_change = CodePointSetData::new::<ChangesWhenUppercased>();
434+
let alphabetic = CodePointSetData::new::<Alphabetic>();
434435
let mut upper = false;
435-
let mut uppercased = String::with_capacity(self.bytes_len());
436-
let cm = CaseMapper::new();
437436
for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
438-
let writer = cm.uppercase(chunk, &LanguageIdentifier::UNKNOWN);
439-
uppercased.clear();
440-
writer
441-
.write_to(&mut uppercased)
442-
.expect("Writing to a buffer is infallible");
443-
if chunk != uppercased {
437+
if chunk.chars().any(|c| case_change.contains(c)) {
444438
return false;
445439
}
446-
upper = true;
440+
441+
if !upper && chunk.chars().any(|c| alphabetic.contains(c)) {
442+
upper = true;
443+
}
447444
}
448445
upper
449446
}

extra_tests/snippets/builtin_str.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,12 @@
222222
assert "abc\t12345\txyz".expandtabs() == "abc 12345 xyz"
223223
assert "-".join(["1", "2", "3"]) == "1-2-3"
224224
assert "HALLO".isupper()
225+
assert not "123".isupper()
226+
assert not "123".islower()
227+
assert not "\U0001f431".isupper()
228+
assert not "\U0001f431".islower()
229+
assert "\U0001f431 CAT".isupper()
230+
assert "\U0001f431 cat".islower()
225231
assert "\u0295".islower()
226232
assert "\u1c89".isupper()
227233
assert "hello, my name is".partition("my ") == ("hello, ", "my ", "name is")

0 commit comments

Comments
 (0)