Skip to content

Commit 712bd75

Browse files
Ensure islower/isupper handles strs without chars
This fixes a regression mentioned by CodeRabbit. I also figured out how to check a string's case without allocation using Unicode properties. Thus, this commit removes `icu_casemap` again. `icu_casemap` and my old solution is required for a robust case check, but it seems like the current code is fine for Python.
1 parent 44ef516 commit 712bd75

File tree

7 files changed

+24
-114
lines changed

7 files changed

+24
-114
lines changed

Cargo.lock

Lines changed: 0 additions & 55 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -222,16 +222,13 @@ strum = "0.28"
222222
strum_macros = "0.28"
223223
syn = "2"
224224
thiserror = "2.0"
225-
icu_casemap = "2"
226-
icu_locale = "2"
227225
icu_properties = "2"
228226
icu_normalizer = "2"
229227
unicode-casing = "0.1.1"
230228
unic-ucd-age = "0.9.0"
231229
unicode_names2 = "2.0.0"
232230
widestring = "1.2.0"
233231
windows-sys = "0.61.2"
234-
writeable = "0.6.1"
235232
wasm-bindgen = "0.2.106"
236233

237234
# Lints

crates/vm/Cargo.toml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,10 +86,7 @@ timsort = "0.1.2"
8686
# TODO: use unic for this; needed for title case:
8787
# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939
8888
unicode-casing = { workspace = true }
89-
icu_casemap = { workspace = true }
90-
icu_locale = { workspace = true }
9189
icu_properties = { workspace = true }
92-
writeable = { workspace = true }
9390

9491
[target.'cfg(unix)'.dependencies]
9592
rustix = { workspace = true }

crates/vm/src/anystr.rs

Lines changed: 18 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@ use crate::{
44
convert::TryFromBorrowedObject,
55
function::OptionalOption,
66
};
7-
use icu_casemap::CaseMapper;
8-
use icu_locale::LanguageIdentifier;
7+
use icu_properties::{
8+
CodePointSetData,
9+
props::{Alphabetic, ChangesWhenLowercased, ChangesWhenUppercased},
10+
};
911
use num_traits::{cast::ToPrimitive, sign::Signed};
10-
use writeable::Writeable;
1112

1213
use core::ops::Range;
1314

@@ -133,8 +134,6 @@ where
133134
}
134135

135136
pub trait AnyChar: Copy {
136-
fn is_lowercase(self) -> bool;
137-
fn is_uppercase(self) -> bool;
138137
fn bytes_len(self) -> usize;
139138
}
140139

@@ -410,19 +409,17 @@ pub trait AnyStr {
410409
// _Py_bytes_islower
411410
// unicode_islower_impl
412411
fn py_islower(&self) -> bool {
412+
let case_change = CodePointSetData::new::<ChangesWhenLowercased>();
413+
let alphabetic = CodePointSetData::new::<Alphabetic>();
413414
let mut lower = false;
414-
let mut lowercased = String::with_capacity(self.bytes_len());
415-
let cm = CaseMapper::new();
416415
for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
417-
let writer = cm.lowercase(chunk, &LanguageIdentifier::UNKNOWN);
418-
lowercased.clear();
419-
writer
420-
.write_to(&mut lowercased)
421-
.expect("Writing to a buffer is infallible");
422-
if chunk != lowercased {
416+
if chunk.chars().any(|c| case_change.contains(c)) {
423417
return false;
424418
}
425-
lower = true;
419+
420+
if !lower && chunk.chars().any(|c| alphabetic.contains(c)) {
421+
lower = true;
422+
}
426423
}
427424
lower
428425
}
@@ -431,19 +428,17 @@ pub trait AnyStr {
431428
// Py_bytes_isupper
432429
// unicode_isupper_impl
433430
fn py_isupper(&self) -> bool {
431+
let case_change = CodePointSetData::new::<ChangesWhenUppercased>();
432+
let alphabetic = CodePointSetData::new::<Alphabetic>();
434433
let mut upper = false;
435-
let mut uppercased = String::with_capacity(self.bytes_len());
436-
let cm = CaseMapper::new();
437434
for chunk in self.as_bytes().utf8_chunks().map(|c| c.valid()) {
438-
let writer = cm.uppercase(chunk, &LanguageIdentifier::UNKNOWN);
439-
uppercased.clear();
440-
writer
441-
.write_to(&mut uppercased)
442-
.expect("Writing to a buffer is infallible");
443-
if chunk != uppercased {
435+
if chunk.chars().any(|c| case_change.contains(c)) {
444436
return false;
445437
}
446-
upper = true;
438+
439+
if !upper && chunk.chars().any(|c| alphabetic.contains(c)) {
440+
upper = true;
441+
}
447442
}
448443
upper
449444
}

crates/vm/src/builtins/str.rs

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -2228,14 +2228,6 @@ impl AnyStrContainer<str> for String {
22282228
}
22292229

22302230
impl anystr::AnyChar for char {
2231-
fn is_lowercase(self) -> bool {
2232-
self.is_lowercase()
2233-
}
2234-
2235-
fn is_uppercase(self) -> bool {
2236-
self.is_uppercase()
2237-
}
2238-
22392231
fn bytes_len(self) -> usize {
22402232
self.len_utf8()
22412233
}
@@ -2341,12 +2333,6 @@ impl AnyStrContainer<Wtf8> for Wtf8Buf {
23412333
}
23422334

23432335
impl anystr::AnyChar for CodePoint {
2344-
fn is_lowercase(self) -> bool {
2345-
self.is_char_and(char::is_lowercase)
2346-
}
2347-
fn is_uppercase(self) -> bool {
2348-
self.is_char_and(char::is_uppercase)
2349-
}
23502336
fn bytes_len(self) -> usize {
23512337
self.len_wtf8()
23522338
}
@@ -2459,14 +2445,6 @@ impl AnyStrContainer<AsciiStr> for AsciiString {
24592445
}
24602446

24612447
impl anystr::AnyChar for ascii::AsciiChar {
2462-
fn is_lowercase(self) -> bool {
2463-
self.is_lowercase()
2464-
}
2465-
2466-
fn is_uppercase(self) -> bool {
2467-
self.is_uppercase()
2468-
}
2469-
24702448
fn bytes_len(self) -> usize {
24712449
1
24722450
}

crates/vm/src/bytes_inner.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1031,14 +1031,6 @@ impl AnyStrContainer<[u8]> for Vec<u8> {
10311031
const ASCII_WHITESPACES: [u8; 6] = [0x20, 0x09, 0x0a, 0x0c, 0x0d, 0x0b];
10321032

10331033
impl anystr::AnyChar for u8 {
1034-
fn is_lowercase(self) -> bool {
1035-
self.is_ascii_lowercase()
1036-
}
1037-
1038-
fn is_uppercase(self) -> bool {
1039-
self.is_ascii_uppercase()
1040-
}
1041-
10421034
fn bytes_len(self) -> usize {
10431035
1
10441036
}

extra_tests/snippets/builtin_str.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,12 @@
222222
assert "abc\t12345\txyz".expandtabs() == "abc 12345 xyz"
223223
assert "-".join(["1", "2", "3"]) == "1-2-3"
224224
assert "HALLO".isupper()
225+
assert not "123".isupper()
226+
assert not "123".islower()
227+
assert not "\U0001f431".isupper()
228+
assert not "\U0001f431".islower()
229+
assert "\U0001f431 CAT".isupper()
230+
assert "\U0001f431 cat".islower()
225231
assert "\u0295".islower()
226232
assert "\u1c89".isupper()
227233
assert "hello, my name is".partition("my ") == ("hello, ", "my ", "name is")

0 commit comments

Comments
 (0)