From 5c5aff8fd9729050681697fc81a1c721b2199d92 Mon Sep 17 00:00:00 2001 From: ShaharNaveh Date: Fri, 6 Mar 2026 09:47:14 +0900 Subject: [PATCH 1/3] Base implementation of _tokenize module Port from PR #6240 by ShaharNaveh, adapted to current codebase. Uses ruff_python_parser for tokenization via TokenizerIter. --- Cargo.lock | 4 + crates/stdlib/Cargo.toml | 5 + crates/stdlib/src/lib.rs | 2 + crates/stdlib/src/tokenize.rs | 391 ++++++++++++++++++++++++++++++++++ 4 files changed, 402 insertions(+) create mode 100644 crates/stdlib/src/tokenize.rs diff --git a/Cargo.lock b/Cargo.lock index ffece1cb31a..1a291035bb0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3283,6 +3283,10 @@ dependencies = [ "pkcs8", "pymath", "rand_core 0.9.5", + "ruff_python_ast", + "ruff_python_parser", + "ruff_source_file", + "ruff_text_size", "rustix", "rustls", "rustls-native-certs", diff --git a/crates/stdlib/Cargo.toml b/crates/stdlib/Cargo.toml index a40a5bf24a8..7d230fcc046 100644 --- a/crates/stdlib/Cargo.toml +++ b/crates/stdlib/Cargo.toml @@ -31,6 +31,11 @@ rustpython-derive = { workspace = true } rustpython-vm = { workspace = true, default-features = false, features = ["compiler"]} rustpython-common = { workspace = true } +ruff_python_parser = { workspace = true } +ruff_python_ast = { workspace = true } +ruff_text_size = { workspace = true } +ruff_source_file = { workspace = true } + ahash = { workspace = true } ascii = { workspace = true } cfg-if = { workspace = true } diff --git a/crates/stdlib/src/lib.rs b/crates/stdlib/src/lib.rs index 8c234c22f89..04aa623d185 100644 --- a/crates/stdlib/src/lib.rs +++ b/crates/stdlib/src/lib.rs @@ -49,6 +49,7 @@ mod pystruct; mod random; mod statistics; mod suggestions; +mod tokenize; // TODO: maybe make this an extension module, if we ever get those // mod re; #[cfg(all(feature = "host_env", not(target_arch = "wasm32")))] @@ -225,6 +226,7 @@ pub fn stdlib_module_defs(ctx: &Context) -> Vec<&'static builtins::PyModuleDef> ssl::module_def(ctx), 
statistics::module_def(ctx), suggestions::module_def(ctx), + tokenize::module_def(ctx), #[cfg(all(feature = "host_env", unix, not(target_os = "redox")))] syslog::module_def(ctx), #[cfg(all( diff --git a/crates/stdlib/src/tokenize.rs b/crates/stdlib/src/tokenize.rs new file mode 100644 index 00000000000..33667a203ec --- /dev/null +++ b/crates/stdlib/src/tokenize.rs @@ -0,0 +1,391 @@ +pub(crate) use _tokenize::module_def; + +#[pymodule] +mod _tokenize { + use crate::{ + common::lock::PyRwLock, + vm::{ + AsObject, Py, PyPayload, PyResult, VirtualMachine, + builtins::{PyBytes, PyStr, PyType}, + convert::ToPyObject, + function::ArgCallable, + protocol::PyIterReturn, + types::{Constructor, IterNext, Iterable, SelfIter}, + }, + }; + use ruff_python_ast::PySourceType; + use ruff_python_ast::token::{Token, TokenKind, Tokens}; + use ruff_python_parser::{ParseError, parse_unchecked_source}; + use ruff_source_file::{LineIndex, LineRanges}; + use ruff_text_size::{Ranged, TextRange}; + use std::{cmp::Ordering, fmt}; + + /// `__import__("token").OP` + const TOKEN_OP: u8 = 55; + + #[pyattr] + #[pyclass(name = "TokenizerIter")] + #[derive(PyPayload)] + pub struct PyTokenizerIter { + readline: ArgCallable, // TODO: This should be PyObject + extra_tokens: bool, + encoding: Option, + state: PyRwLock, + } + + impl PyTokenizerIter { + fn readline(&self, vm: &VirtualMachine) -> PyResult { + // TODO: When `readline` is PyObject, + // we need to check if it's callable and raise a type error if it's not. + let raw_line = match self.readline.invoke((), vm) { + Ok(v) => v, + Err(err) => { + if err.fast_isinstance(vm.ctx.exceptions.stop_iteration) { + return Ok(String::new()); + } + return Err(err); + } + }; + Ok(match &self.encoding { + Some(encoding) => { + let bytes = raw_line + .downcast::() + .map_err(|_| vm.new_type_error("readline() returned a non-bytes object"))?; + vm.state + .codec_registry + .decode_text(bytes.into(), encoding, None, vm) + .map(|s| s.to_string())? 
+ } + None => raw_line + .downcast::() + .map(|s| s.to_string()) + .map_err(|_| vm.new_type_error("readline() returned a non-string object"))?, + }) + } + } + + impl fmt::Debug for PyTokenizerIter { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("PyTokenizerIter") + .field("readline", &self.readline) + .field("encoding", &self.encoding) + .field("extra_tokens", &self.extra_tokens) + .finish() + } + } + + #[pyclass(with(Constructor, Iterable, IterNext))] + impl PyTokenizerIter {} + + impl Constructor for PyTokenizerIter { + type Args = PyTokenizerIterArgs; + + fn py_new(_cls: &Py, args: Self::Args, _vm: &VirtualMachine) -> PyResult { + let Self::Args { + readline, + extra_tokens, + encoding, + } = args; + + Ok(Self { + readline, + extra_tokens, + encoding: encoding.map(|s| s.to_string()), + state: PyRwLock::new(PyTokenizerIterState::default()), + }) + } + } + + impl SelfIter for PyTokenizerIter {} + + impl IterNext for PyTokenizerIter { + fn next(zelf: &Py, vm: &VirtualMachine) -> PyResult { + let mut state = { + let guard = zelf.state.read(); + guard.clone() + }; + + if state.eof { + return Ok(PyIterReturn::StopIteration(None)); + } + + let token = loop { + // TODO: Check here for errors. 
Raise SyntaxError if needed + + if let Some(tok) = state.next_token() { + break tok; + } + + let nline = zelf.readline(vm)?; + if nline.is_empty() { + state.eof = true; + *zelf.state.write() = state.clone(); + + let line_num = &state.start().0; + let out = vm + .ctx + .new_tuple(vec![ + token_kind_value(TokenKind::EndOfFile).to_pyobject(vm), + vm.ctx.new_str("").into(), + vm.ctx + .new_tuple(vec![line_num.to_pyobject(vm), (-1).to_pyobject(vm)]) + .into(), + vm.ctx + .new_tuple(vec![line_num.to_pyobject(vm), (-1).to_pyobject(vm)]) + .into(), + vm.ctx.new_str(state.current_line()).into(), + ]) + .into(); + return Ok(PyIterReturn::Return(out)); + } + state.push_line(&nline); + }; + + *zelf.state.write() = state.clone(); + + let token_kind = token.kind(); + let token_value = if zelf.extra_tokens && token_kind.is_operator() { + TOKEN_OP + } else { + token_kind_value(token_kind) + }; + let (start_x, start_y) = &state.start(); + let (end_x, end_y) = &state.end(); + + let mut token_repr = &state.source[state.range()]; + if !zelf.extra_tokens { + token_repr = token_repr.trim(); + } + + let out = vm + .ctx + .new_tuple(vec![ + token_value.to_pyobject(vm), + vm.ctx.new_str(token_repr).into(), + vm.ctx + .new_tuple(vec![start_x.to_pyobject(vm), start_y.to_pyobject(vm)]) + .into(), + vm.ctx + .new_tuple(vec![end_x.to_pyobject(vm), end_y.to_pyobject(vm)]) + .into(), + vm.ctx.new_str(state.current_line()).into(), + ]) + .into(); + Ok(PyIterReturn::Return(out)) + } + } + + #[derive(FromArgs)] + pub struct PyTokenizerIterArgs { + #[pyarg(positional)] + readline: ArgCallable, + #[pyarg(named)] + extra_tokens: bool, + #[pyarg(named, optional)] + encoding: Option>, + } + + #[derive(Clone, Debug)] + struct PyTokenizerIterState { + /// Source code. + source: String, + prev_token: Option, + /// Tokens of `source`. + tokens: Tokens, + /// Errors of `source` + errors: Vec, + /// LineIndex of `source`. 
+ line_index: LineIndex, + /// Marker that says we already emitted EOF, and needs to stop iterating. + eof: bool, + } + + impl PyTokenizerIterState { + fn push_line(&mut self, line: &str) { + self.source.push_str(line); + + let parsed = parse_unchecked_source(&self.source, PySourceType::Python); + self.tokens = parsed.tokens().clone(); + self.errors = parsed.errors().to_vec(); + self.line_index = LineIndex::from_source_text(&self.source); + } + + #[must_use] + fn current_line(&self) -> &str { + let (kind, range): (TokenKind, TextRange) = match self.prev_token { + Some(token) => token.as_tuple(), + None => (TokenKind::Unknown, TextRange::default()), + }; + + match kind { + TokenKind::Newline => self.source.full_line_str(range.start()), + _ => self.source.full_lines_str(range), + } + } + + #[must_use] + fn next_token(&mut self) -> Option { + for token in self.tokens.iter() { + let (kind, range): (TokenKind, TextRange) = token.as_tuple(); + + if matches!(kind, TokenKind::NonLogicalNewline) { + continue; + } + + if matches!(range.ordering(self.range()), Ordering::Greater) { + self.prev_token = Some(*token); + return self.prev_token; + } + } + + None + } + + #[must_use] + fn range(&self) -> TextRange { + match self.prev_token { + Some(token) => token.range(), + None => TextRange::default(), + } + } + + #[must_use] + fn start(&self) -> (usize, usize) { + let lc = self + .line_index + .line_column(self.range().start(), &self.source); + (lc.line.get(), lc.column.to_zero_indexed()) + } + + #[must_use] + fn end(&self) -> (usize, usize) { + let lc = self + .line_index + .line_column(self.range().end(), &self.source); + (lc.line.get(), lc.column.to_zero_indexed()) + } + } + + impl Default for PyTokenizerIterState { + fn default() -> Self { + const SOURCE: &str = ""; + let parsed = parse_unchecked_source(SOURCE, PySourceType::Python); + + Self { + source: SOURCE.to_owned(), + prev_token: None, + tokens: parsed.tokens().clone(), + errors: parsed.errors().to_vec(), + line_index: 
LineIndex::from_source_text(SOURCE), + eof: false, + } + } + }
+
+    /// Translate a ruff [`TokenKind`] into the numeric token type used by
+    /// CPython's `token` module (3.14 numbering).
+    ///
+    /// The returned value is the first element of every tuple produced by
+    /// `TokenizerIter.__next__`.
+    ///
+    /// * Every keyword kind, hard or soft, is reported as `NAME` (1).
+    /// * `Int`, `Float` and `Complex` are all reported as `NUMBER` (2).
+    /// * Operators return their exact type here; the caller replaces it with
+    ///   `TOKEN_OP` when `extra_tokens` is set.
+    /// * Kinds that exist only in ruff (`IpyEscapeCommand`, `Question`) and
+    ///   `Unknown` return 0.
+    const fn token_kind_value(kind: TokenKind) -> u8 {
+        match kind {
+            TokenKind::EndOfFile => 0,
+            TokenKind::Name
+            | TokenKind::For
+            | TokenKind::In
+            | TokenKind::Pass
+            | TokenKind::Class
+            | TokenKind::And
+            | TokenKind::Is
+            | TokenKind::Raise
+            | TokenKind::True
+            | TokenKind::False
+            | TokenKind::Assert
+            | TokenKind::Try
+            | TokenKind::While
+            | TokenKind::Yield
+            | TokenKind::Lambda
+            | TokenKind::None
+            | TokenKind::Not
+            | TokenKind::Or
+            | TokenKind::Break
+            | TokenKind::Continue
+            | TokenKind::Global
+            | TokenKind::Nonlocal
+            | TokenKind::Return
+            | TokenKind::Except
+            | TokenKind::Import
+            | TokenKind::Case
+            | TokenKind::Match
+            | TokenKind::Type
+            | TokenKind::Await
+            | TokenKind::With
+            | TokenKind::Del
+            | TokenKind::Finally
+            | TokenKind::From
+            | TokenKind::Def
+            | TokenKind::If
+            | TokenKind::Else
+            | TokenKind::Elif
+            | TokenKind::As
+            | TokenKind::Async => 1,
+            TokenKind::Int | TokenKind::Complex | TokenKind::Float => 2,
+            TokenKind::String => 3,
+            TokenKind::Newline => 4,
+            TokenKind::Indent => 5,
+            TokenKind::Dedent => 6,
+            TokenKind::Lpar => 7,
+            TokenKind::Rpar => 8,
+            TokenKind::Lsqb => 9,
+            TokenKind::Rsqb => 10,
+            TokenKind::Colon => 11,
+            TokenKind::Comma => 12,
+            TokenKind::Semi => 13,
+            TokenKind::Plus => 14,
+            TokenKind::Minus => 15,
+            TokenKind::Star => 16,
+            TokenKind::Slash => 17,
+            TokenKind::Vbar => 18,
+            TokenKind::Amper => 19,
+            TokenKind::Less => 20,
+            TokenKind::Greater => 21,
+            TokenKind::Equal => 22,
+            TokenKind::Dot => 23,
+            TokenKind::Percent => 24,
+            TokenKind::Lbrace => 25,
+            TokenKind::Rbrace => 26,
+            TokenKind::EqEqual => 27,
+            TokenKind::NotEqual => 28,
+            TokenKind::LessEqual => 29,
+            TokenKind::GreaterEqual => 30,
+            TokenKind::Tilde => 31,
+            TokenKind::CircumFlex => 32,
+            TokenKind::LeftShift => 33,
+            TokenKind::RightShift => 34,
+            TokenKind::DoubleStar => 35,
+            TokenKind::PlusEqual => 36,
+            TokenKind::MinusEqual => 37,
+            TokenKind::StarEqual => 38,
+            TokenKind::SlashEqual => 39,
+            TokenKind::PercentEqual => 40,
+            TokenKind::AmperEqual => 41,
+            TokenKind::VbarEqual => 42,
+            TokenKind::CircumflexEqual => 43,
+            TokenKind::LeftShiftEqual => 44,
+            TokenKind::RightShiftEqual => 45,
+            TokenKind::DoubleStarEqual => 46,
+            TokenKind::DoubleSlash => 47,
+            TokenKind::DoubleSlashEqual => 48,
+            TokenKind::At => 49,
+            TokenKind::AtEqual => 50,
+            TokenKind::Rarrow => 51,
+            TokenKind::Ellipsis => 52,
+            TokenKind::ColonEqual => 53,
+            TokenKind::Exclamation => 54,
+            // 55..=58 are OP, TYPE_IGNORE, TYPE_COMMENT and SOFT_KEYWORD in token.py;
+            // no ruff kind maps to them directly (OP is substituted by the caller).
+            TokenKind::FStringStart => 59,
+            TokenKind::FStringMiddle => 60,
+            TokenKind::FStringEnd => 61,
+            TokenKind::TStringStart => 62, // 3.14 compatible
+            TokenKind::TStringMiddle => 63, // 3.14 compatible
+            TokenKind::TStringEnd => 64, // 3.14 compatible
+            // COMMENT was 62 before 3.14 and would collide with TSTRING_START there.
+            TokenKind::Comment => 65, // 3.14 compatible
+            // NL is a distinct token type from NEWLINE in token.py.
+            TokenKind::NonLogicalNewline => 66, // 3.14 compatible
+            TokenKind::IpyEscapeCommand | TokenKind::Question => 0, // Ruff's specific
+            // NOTE(review): this shares 0 with ENDMARKER; token.py also defines
+            // ERRORTOKEN (67) -- confirm which value consumers should see.
+            TokenKind::Unknown => 0,
+        }
+    }
+}
From bf2b993c93ad55363aa0b8c95222aab3b58c135e Mon Sep 17 00:00:00 2001 From: CPython Developers <> Date: Fri, 6 Mar 2026 09:53:33 +0900 Subject: [PATCH 2/3] Update tokenize from v3.14.3 --- Lib/test/test_tokenize.py | 2150 ++++++++++++++++++++++++++++++++++--- Lib/tokenize.py | 353 +++--- 2 files changed, 2126 insertions(+), 377 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 44ef4e24165..c10f80a723c 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,17 +1,22 @@ -from test import support -from test.support import os_helper -from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, - STRING, ENDMARKER, ENCODING, tok_name, detect_encoding, - open as tokenize_open, Untokenizer, generate_tokens, - NEWLINE) -from io import BytesIO, StringIO +import contextlib +import itertools +import os +import re +import string +import tempfile +import token +import tokenize import unittest +from io import BytesIO, StringIO from textwrap import dedent from unittest import TestCase, mock 
-from test.test_grammar import (VALID_UNDERSCORE_LITERALS, - INVALID_UNDERSCORE_LITERALS) -import os -import token +from test import support +from test.support import os_helper +from test.support.script_helper import run_test_script, make_script, run_python_until_end +from test.support.numbers import ( + VALID_UNDERSCORE_LITERALS, + INVALID_UNDERSCORE_LITERALS, +) # Converts a source string into a list of textual representation @@ -24,12 +29,12 @@ def stringify_tokens_from_source(token_generator, source_string): missing_trailing_nl = source_string[-1] not in '\r\n' for type, token, start, end, line in token_generator: - if type == ENDMARKER: + if type == tokenize.ENDMARKER: break # Ignore the new line on the last line if the input lacks one - if missing_trailing_nl and type == NEWLINE and end[0] == num_lines: + if missing_trailing_nl and type == tokenize.NEWLINE and end[0] == num_lines: continue - type = tok_name[type] + type = tokenize.tok_name[type] result.append(f" {type:10} {token!r:13} {start} {end}") return result @@ -45,18 +50,37 @@ def check_tokenize(self, s, expected): # Format the tokens in s in a table format. # The ENDMARKER and final NEWLINE are omitted. 
f = BytesIO(s.encode('utf-8')) - result = stringify_tokens_from_source(tokenize(f.readline), s) + result = stringify_tokens_from_source(tokenize.tokenize(f.readline), s) self.assertEqual(result, [" ENCODING 'utf-8' (0, 0) (0, 0)"] + expected.rstrip().splitlines()) + def test_invalid_readline(self): + def gen(): + yield "sdfosdg" + yield "sdfosdg" + with self.assertRaises(TypeError): + list(tokenize.tokenize(gen().__next__)) + + def gen(): + yield b"sdfosdg" + yield b"sdfosdg" + with self.assertRaises(TypeError): + list(tokenize.generate_tokens(gen().__next__)) + + def gen(): + yield "sdfosdg" + 1/0 + with self.assertRaises(ZeroDivisionError): + list(tokenize.generate_tokens(gen().__next__)) + def test_implicit_newline(self): # Make sure that the tokenizer puts in an implicit NEWLINE # when the input lacks a trailing new line. f = BytesIO("x".encode('utf-8')) - tokens = list(tokenize(f.readline)) - self.assertEqual(tokens[-2].type, NEWLINE) - self.assertEqual(tokens[-1].type, ENDMARKER) + tokens = list(tokenize.tokenize(f.readline)) + self.assertEqual(tokens[-2].type, tokenize.NEWLINE) + self.assertEqual(tokens[-1].type, tokenize.ENDMARKER) def test_basic(self): self.check_tokenize("1 + 1", """\ @@ -83,6 +107,32 @@ def test_basic(self): NEWLINE '\\n' (4, 26) (4, 27) DEDENT '' (5, 0) (5, 0) """) + + self.check_tokenize("if True:\r\n # NL\r\n foo='bar'\r\n\r\n", """\ + NAME 'if' (1, 0) (1, 2) + NAME 'True' (1, 3) (1, 7) + OP ':' (1, 7) (1, 8) + NEWLINE '\\r\\n' (1, 8) (1, 10) + COMMENT '# NL' (2, 4) (2, 8) + NL '\\r\\n' (2, 8) (2, 10) + INDENT ' ' (3, 0) (3, 4) + NAME 'foo' (3, 4) (3, 7) + OP '=' (3, 7) (3, 8) + STRING "\'bar\'" (3, 8) (3, 13) + NEWLINE '\\r\\n' (3, 13) (3, 15) + NL '\\r\\n' (4, 0) (4, 2) + DEDENT '' (5, 0) (5, 0) + """) + + self.check_tokenize("x = 1 + \\\r\n1\r\n", """\ + NAME 'x' (1, 0) (1, 1) + OP '=' (1, 2) (1, 3) + NUMBER '1' (1, 4) (1, 5) + OP '+' (1, 6) (1, 7) + NUMBER '1' (2, 0) (2, 1) + NEWLINE '\\r\\n' (2, 1) (2, 3) + """) + 
indent_error_file = b"""\ def k(x): x += 2 @@ -91,9 +141,18 @@ def k(x): readline = BytesIO(indent_error_file).readline with self.assertRaisesRegex(IndentationError, "unindent does not match any " - "outer indentation level"): - for tok in tokenize(readline): + "outer indentation level") as e: + for tok in tokenize.tokenize(readline): pass + self.assertEqual(e.exception.lineno, 3) + self.assertEqual(e.exception.filename, '') + self.assertEqual(e.exception.end_lineno, None) + self.assertEqual(e.exception.end_offset, None) + self.assertEqual( + e.exception.msg, + 'unindent does not match any outer indentation level') + self.assertEqual(e.exception.offset, 9) + self.assertEqual(e.exception.text, ' x += 5') def test_int(self): # Ordinary integers and binary operators @@ -177,7 +236,7 @@ def test_long(self): """) def test_float(self): - # Floating point numbers + # Floating-point numbers self.check_tokenize("x = 3.14159", """\ NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) @@ -219,8 +278,8 @@ def test_float(self): def test_underscore_literals(self): def number_token(s): f = BytesIO(s.encode('utf-8')) - for toktype, token, start, end, line in tokenize(f.readline): - if toktype == NUMBER: + for toktype, token, start, end, line in tokenize.tokenize(f.readline): + if toktype == tokenize.NUMBER: return token return 'invalid token' for lit in VALID_UNDERSCORE_LITERALS: @@ -228,7 +287,16 @@ def number_token(s): # this won't work with compound complex inputs continue self.assertEqual(number_token(lit), lit) + # Valid cases with extra underscores in the tokenize module + # See gh-105549 for context + extra_valid_cases = {"0_7", "09_99"} for lit in INVALID_UNDERSCORE_LITERALS: + if lit in extra_valid_cases: + continue + try: + number_token(lit) + except tokenize.TokenError: + continue self.assertNotEqual(number_token(lit), lit) def test_string(self): @@ -380,21 +448,175 @@ def test_string(self): STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4) """) self.check_tokenize('f"abc"', 
"""\ - STRING 'f"abc"' (1, 0) (1, 6) + FSTRING_START 'f"' (1, 0) (1, 2) + FSTRING_MIDDLE 'abc' (1, 2) (1, 5) + FSTRING_END '"' (1, 5) (1, 6) """) self.check_tokenize('fR"a{b}c"', """\ - STRING 'fR"a{b}c"' (1, 0) (1, 9) + FSTRING_START 'fR"' (1, 0) (1, 3) + FSTRING_MIDDLE 'a' (1, 3) (1, 4) + OP '{' (1, 4) (1, 5) + NAME 'b' (1, 5) (1, 6) + OP '}' (1, 6) (1, 7) + FSTRING_MIDDLE 'c' (1, 7) (1, 8) + FSTRING_END '"' (1, 8) (1, 9) + """) + self.check_tokenize('fR"a{{{b!r}}}c"', """\ + FSTRING_START 'fR"' (1, 0) (1, 3) + FSTRING_MIDDLE 'a{' (1, 3) (1, 5) + OP '{' (1, 6) (1, 7) + NAME 'b' (1, 7) (1, 8) + OP '!' (1, 8) (1, 9) + NAME 'r' (1, 9) (1, 10) + OP '}' (1, 10) (1, 11) + FSTRING_MIDDLE '}' (1, 11) (1, 12) + FSTRING_MIDDLE 'c' (1, 13) (1, 14) + FSTRING_END '"' (1, 14) (1, 15) + """) + self.check_tokenize('f"{{{1+1}}}"', """\ + FSTRING_START 'f"' (1, 0) (1, 2) + FSTRING_MIDDLE '{' (1, 2) (1, 3) + OP '{' (1, 4) (1, 5) + NUMBER '1' (1, 5) (1, 6) + OP '+' (1, 6) (1, 7) + NUMBER '1' (1, 7) (1, 8) + OP '}' (1, 8) (1, 9) + FSTRING_MIDDLE '}' (1, 9) (1, 10) + FSTRING_END '"' (1, 11) (1, 12) + """) + self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + OP '{' (1, 4) (1, 5) + FSTRING_START "f'''" (1, 5) (1, 9) + OP '{' (1, 9) (1, 10) + FSTRING_START "f'" (1, 10) (1, 12) + OP '{' (1, 12) (1, 13) + FSTRING_START 'f"' (1, 13) (1, 15) + OP '{' (1, 15) (1, 16) + NUMBER '1' (1, 16) (1, 17) + OP '+' (1, 17) (1, 18) + NUMBER '1' (1, 18) (1, 19) + OP '}' (1, 19) (1, 20) + FSTRING_END '"' (1, 20) (1, 21) + OP '}' (1, 21) (1, 22) + FSTRING_END "'" (1, 22) (1, 23) + OP '}' (1, 23) (1, 24) + FSTRING_END "'''" (1, 24) (1, 27) + OP '}' (1, 27) (1, 28) + FSTRING_END '\"""' (1, 28) (1, 31) + """) + self.check_tokenize('f""" x\nstr(data, encoding={invalid!r})\n"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE ' x\\nstr(data, encoding=' (1, 4) (2, 19) + OP '{' (2, 19) (2, 20) + NAME 'invalid' (2, 20) (2, 27) + OP '!' 
(2, 27) (2, 28) + NAME 'r' (2, 28) (2, 29) + OP '}' (2, 29) (2, 30) + FSTRING_MIDDLE ')\\n' (2, 30) (3, 0) + FSTRING_END '\"""' (3, 0) (3, 3) + """) + self.check_tokenize('f"""123456789\nsomething{None}bad"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE '123456789\\nsomething' (1, 4) (2, 9) + OP '{' (2, 9) (2, 10) + NAME 'None' (2, 10) (2, 14) + OP '}' (2, 14) (2, 15) + FSTRING_MIDDLE 'bad' (2, 15) (2, 18) + FSTRING_END '\"""' (2, 18) (2, 21) """) self.check_tokenize('f"""abc"""', """\ - STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10) + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE 'abc' (1, 4) (1, 7) + FSTRING_END '\"""' (1, 7) (1, 10) """) self.check_tokenize(r'f"abc\ def"', """\ - STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4) + FSTRING_START 'f"' (1, 0) (1, 2) + FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 2) (2, 3) + FSTRING_END '"' (2, 3) (2, 4) """) self.check_tokenize(r'Rf"abc\ def"', """\ - STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4) + FSTRING_START 'Rf"' (1, 0) (1, 3) + FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3) + FSTRING_END '"' (2, 3) (2, 4) + """) + self.check_tokenize("f'some words {a+b:.3f} more words {c+d=} final words'", """\ + FSTRING_START "f'" (1, 0) (1, 2) + FSTRING_MIDDLE 'some words ' (1, 2) (1, 13) + OP '{' (1, 13) (1, 14) + NAME 'a' (1, 14) (1, 15) + OP '+' (1, 15) (1, 16) + NAME 'b' (1, 16) (1, 17) + OP ':' (1, 17) (1, 18) + FSTRING_MIDDLE '.3f' (1, 18) (1, 21) + OP '}' (1, 21) (1, 22) + FSTRING_MIDDLE ' more words ' (1, 22) (1, 34) + OP '{' (1, 34) (1, 35) + NAME 'c' (1, 35) (1, 36) + OP '+' (1, 36) (1, 37) + NAME 'd' (1, 37) (1, 38) + OP '=' (1, 38) (1, 39) + OP '}' (1, 39) (1, 40) + FSTRING_MIDDLE ' final words' (1, 40) (1, 52) + FSTRING_END "'" (1, 52) (1, 53) + """) + self.check_tokenize("""\ +f'''{ +3 +=}'''""", """\ + FSTRING_START "f'''" (1, 0) (1, 4) + OP '{' (1, 4) (1, 5) + NL '\\n' (1, 5) (1, 6) + NUMBER '3' (2, 0) (2, 1) + NL '\\n' (2, 1) (2, 2) + OP '=' (3, 0) (3, 1) + OP '}' (3, 1) (3, 2) + FSTRING_END "'''" (3, 2) (3, 5) + 
""") + self.check_tokenize("""\ +f'''__{ + x:a +}__'''""", """\ + FSTRING_START "f'''" (1, 0) (1, 4) + FSTRING_MIDDLE '__' (1, 4) (1, 6) + OP '{' (1, 6) (1, 7) + NL '\\n' (1, 7) (1, 8) + NAME 'x' (2, 4) (2, 5) + OP ':' (2, 5) (2, 6) + FSTRING_MIDDLE 'a\\n' (2, 6) (3, 0) + OP '}' (3, 0) (3, 1) + FSTRING_MIDDLE '__' (3, 1) (3, 3) + FSTRING_END "'''" (3, 3) (3, 6) + """) + self.check_tokenize("""\ +f'''__{ + x:a + b + c + d +}__'''""", """\ + FSTRING_START "f'''" (1, 0) (1, 4) + FSTRING_MIDDLE '__' (1, 4) (1, 6) + OP '{' (1, 6) (1, 7) + NL '\\n' (1, 7) (1, 8) + NAME 'x' (2, 4) (2, 5) + OP ':' (2, 5) (2, 6) + FSTRING_MIDDLE 'a\\n b\\n c\\n d\\n' (2, 6) (6, 0) + OP '}' (6, 0) (6, 1) + FSTRING_MIDDLE '__' (6, 1) (6, 3) + FSTRING_END "'''" (6, 3) (6, 6) + """) + + self.check_tokenize("""\ + '''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli + aktualni pracownicy, obecni pracownicy''' +""", """\ + INDENT ' ' (1, 0) (1, 4) + STRING "'''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli\\n aktualni pracownicy, obecni pracownicy'''" (1, 4) (2, 45) + NEWLINE '\\n' (2, 45) (2, 46) + DEDENT '' (3, 0) (3, 0) """) def test_function(self): @@ -945,29 +1167,98 @@ async def bar(): pass DEDENT '' (7, 0) (7, 0) """) + @unittest.expectedFailure # TODO: RUSTPYTHON; + " NEWLINE '\\n' (4, 1) (4, 2)"] + def test_newline_after_parenthesized_block_with_comment(self): + self.check_tokenize('''\ +[ + # A comment here + 1 +] +''', """\ + OP '[' (1, 0) (1, 1) + NL '\\n' (1, 1) (1, 2) + COMMENT '# A comment here' (2, 4) (2, 20) + NL '\\n' (2, 20) (2, 21) + NUMBER '1' (3, 4) (3, 5) + NL '\\n' (3, 5) (3, 6) + OP ']' (4, 0) (4, 1) + NEWLINE '\\n' (4, 1) (4, 2) + """) + + def test_closing_parenthesis_from_different_line(self): + self.check_tokenize("); x", """\ + OP ')' (1, 0) (1, 1) + OP ';' (1, 1) (1, 2) + NAME 'x' (1, 3) (1, 4) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON; ' FSTRING_END "\'\'\'" (2, 68) (2, 71)'] + def 
test_multiline_non_ascii_fstring(self): + self.check_tokenize("""\ +a = f''' + Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli'''""", """\ + NAME 'a' (1, 0) (1, 1) + OP '=' (1, 2) (1, 3) + FSTRING_START "f\'\'\'" (1, 4) (1, 8) + FSTRING_MIDDLE '\\n Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli' (1, 8) (2, 68) + FSTRING_END "\'\'\'" (2, 68) (2, 71) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON; Diff is 696 characters long. Set self.maxDiff to None to see it. + def test_multiline_non_ascii_fstring_with_expr(self): + self.check_tokenize("""\ +f''' + 🔗 This is a test {test_arg1}🔗 +🔗'''""", """\ + FSTRING_START "f\'\'\'" (1, 0) (1, 4) + FSTRING_MIDDLE '\\n 🔗 This is a test ' (1, 4) (2, 21) + OP '{' (2, 21) (2, 22) + NAME 'test_arg1' (2, 22) (2, 31) + OP '}' (2, 31) (2, 32) + FSTRING_MIDDLE '🔗\\n🔗' (2, 32) (3, 1) + FSTRING_END "\'\'\'" (3, 1) (3, 4) + """) + + # gh-139516, the '\n' is explicit to ensure no trailing whitespace which would invalidate the test + self.check_tokenize('''f"{f(a=lambda: 'à'\n)}"''', """\ + FSTRING_START \'f"\' (1, 0) (1, 2) + OP '{' (1, 2) (1, 3) + NAME 'f' (1, 3) (1, 4) + OP '(' (1, 4) (1, 5) + NAME 'a' (1, 5) (1, 6) + OP '=' (1, 6) (1, 7) + NAME 'lambda' (1, 7) (1, 13) + OP ':' (1, 13) (1, 14) + STRING "\'à\'" (1, 15) (1, 18) + NL '\\n' (1, 18) (1, 19) + OP ')' (2, 0) (2, 1) + OP '}' (2, 1) (2, 2) + FSTRING_END \'"\' (2, 2) (2, 3) + """) + class GenerateTokensTest(TokenizeTest): def check_tokenize(self, s, expected): # Format the tokens in s in a table format. # The ENDMARKER and final NEWLINE are omitted. 
f = StringIO(s) - result = stringify_tokens_from_source(generate_tokens(f.readline), s) + result = stringify_tokens_from_source(tokenize.generate_tokens(f.readline), s) self.assertEqual(result, expected.rstrip().splitlines()) def decistmt(s): result = [] - g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string + g = tokenize.tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string for toknum, tokval, _, _, _ in g: - if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens + if toknum == tokenize.NUMBER and '.' in tokval: # replace NUMBER tokens result.extend([ - (NAME, 'Decimal'), - (OP, '('), - (STRING, repr(tokval)), - (OP, ')') + (tokenize.NAME, 'Decimal'), + (tokenize.OP, '('), + (tokenize.STRING, repr(tokval)), + (tokenize.OP, ')') ]) else: result.append((toknum, tokval)) - return untokenize(result).decode('utf-8') + return tokenize.untokenize(result).decode('utf-8').strip() class TestMisc(TestCase): @@ -991,6 +1282,13 @@ def test_decistmt(self): self.assertEqual(eval(decistmt(s)), Decimal('-3.217160342717258261933904529E-7')) + def test___all__(self): + expected = token.__all__ + [ + "TokenInfo", "TokenError", "generate_tokens", + "detect_encoding", "untokenize", "open", "tokenize", + ] + self.assertCountEqual(tokenize.__all__, expected) + class TestTokenizerAdheresToPep0263(TestCase): """ @@ -998,8 +1296,9 @@ class TestTokenizerAdheresToPep0263(TestCase): """ def _testFile(self, filename): - path = os.path.join(os.path.dirname(__file__), filename) - TestRoundtrip.check_roundtrip(self, open(path, 'rb')) + path = os.path.join(os.path.dirname(__file__), 'tokenizedata', filename) + with open(path, 'rb') as f: + TestRoundtrip.check_roundtrip(self, f) def test_utf8_coding_cookie_and_no_utf8_bom(self): f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt' @@ -1024,8 +1323,6 @@ def test_utf8_coding_cookie_and_utf8_bom(self): f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt' self._testFile(f) - # TODO: RUSTPYTHON 
- @unittest.expectedFailure # "bad_coding.py" and "bad_coding2.py" make the WASM CI fail def test_bad_coding_cookie(self): self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py') self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py') @@ -1041,33 +1338,18 @@ def readline(): nonlocal first if not first: first = True - return line + yield line else: - return b'' + yield b'' # skip the initial encoding token and the end tokens - tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2] - expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + tokens = list(tokenize._generate_tokens_from_c_tokenizer(readline().__next__, + encoding='utf-8', + extra_tokens=True))[:-2] + expected_tokens = [tokenize.TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] self.assertEqual(tokens, expected_tokens, "bytes not decoded with encoding") - def test__tokenize_does_not_decode_with_encoding_none(self): - literal = '"ЉЊЈЁЂ"' - first = False - def readline(): - nonlocal first - if not first: - first = True - return literal - else: - return b'' - - # skip the end tokens - tokens = list(_tokenize(readline, encoding=None))[:-2] - expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] - self.assertEqual(tokens, expected_tokens, - "string not tokenized when encoding is None") - class TestDetectEncoding(TestCase): @@ -1084,24 +1366,63 @@ def readline(): def test_no_bom_no_encoding_cookie(self): lines = ( - b'# something\n', + b'#!/home/\xc3\xa4/bin/python\n', + b'# something \xe2\x82\xac\n', b'print(something)\n', b'do_something(else)\n' ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'utf-8') self.assertEqual(consumed_lines, list(lines[:2])) + def test_no_bom_no_encoding_cookie_first_line_error(self): + lines = ( + b'#!/home/\xa4/bin/python\n\n', + b'print(something)\n', + b'do_something(else)\n' + ) + with 
self.assertRaises(SyntaxError): + tokenize.detect_encoding(self.get_readline(lines)) + + def test_no_bom_no_encoding_cookie_second_line_error(self): + lines = ( + b'#!/usr/bin/python\n', + b'# something \xe2\n', + b'print(something)\n', + b'do_something(else)\n' + ) + with self.assertRaises(SyntaxError): + tokenize.detect_encoding(self.get_readline(lines)) + def test_bom_no_cookie(self): lines = ( - b'\xef\xbb\xbf# something\n', + b'\xef\xbb\xbf#!/home/\xc3\xa4/bin/python\n', b'print(something)\n', b'do_something(else)\n' ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'utf-8-sig') self.assertEqual(consumed_lines, - [b'# something\n', b'print(something)\n']) + [b'#!/home/\xc3\xa4/bin/python\n', b'print(something)\n']) + + def test_bom_no_cookie_first_line_error(self): + lines = ( + b'\xef\xbb\xbf#!/home/\xa4/bin/python\n', + b'print(something)\n', + b'do_something(else)\n' + ) + with self.assertRaises(SyntaxError): + tokenize.detect_encoding(self.get_readline(lines)) + + def test_bom_no_cookie_second_line_error(self): + lines = ( + b'\xef\xbb\xbf#!/usr/bin/python\n', + b'# something \xe2\n', + b'print(something)\n', + b'do_something(else)\n' + ) + with self.assertRaises(SyntaxError): + tokenize.detect_encoding(self.get_readline(lines)) def test_cookie_first_line_no_bom(self): lines = ( @@ -1109,7 +1430,7 @@ def test_cookie_first_line_no_bom(self): b'print(something)\n', b'do_something(else)\n' ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'iso-8859-1') self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n']) @@ -1119,7 +1440,7 @@ def test_matched_bom_and_cookie_first_line(self): b'print(something)\n', b'do_something(else)\n' ) - encoding, consumed_lines = 
detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'utf-8-sig') self.assertEqual(consumed_lines, [b'# coding=utf-8\n']) @@ -1130,7 +1451,7 @@ def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self): b'do_something(else)\n' ) readline = self.get_readline(lines) - self.assertRaises(SyntaxError, detect_encoding, readline) + self.assertRaises(SyntaxError, tokenize.detect_encoding, readline) def test_cookie_second_line_no_bom(self): lines = ( @@ -1139,7 +1460,7 @@ def test_cookie_second_line_no_bom(self): b'print(something)\n', b'do_something(else)\n' ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'ascii') expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n'] self.assertEqual(consumed_lines, expected) @@ -1151,7 +1472,7 @@ def test_matched_bom_and_cookie_second_line(self): b'print(something)\n', b'do_something(else)\n' ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'utf-8-sig') self.assertEqual(consumed_lines, [b'#! 
something\n', b'f# coding=utf-8\n']) @@ -1164,7 +1485,7 @@ def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self): b'do_something(else)\n' ) readline = self.get_readline(lines) - self.assertRaises(SyntaxError, detect_encoding, readline) + self.assertRaises(SyntaxError, tokenize.detect_encoding, readline) def test_cookie_second_line_noncommented_first_line(self): lines = ( @@ -1172,21 +1493,65 @@ def test_cookie_second_line_noncommented_first_line(self): b'# vim: set fileencoding=iso8859-15 :\n', b"print('\xe2\x82\xac')\n" ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'utf-8') expected = [b"print('\xc2\xa3')\n"] self.assertEqual(consumed_lines, expected) - def test_cookie_second_line_commented_first_line(self): + def test_first_non_utf8_coding_line(self): lines = ( - b"#print('\xc2\xa3')\n", - b'# vim: set fileencoding=iso8859-15 :\n', - b"print('\xe2\x82\xac')\n" + b'#coding:iso-8859-15 \xa4\n', + b'print(something)\n' ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) - self.assertEqual(encoding, 'iso8859-15') - expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n'] - self.assertEqual(consumed_lines, expected) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso-8859-15') + self.assertEqual(consumed_lines, list(lines[:1])) + + def test_first_utf8_coding_line_error(self): + lines = ( + b'#coding:ascii \xc3\xa4\n', + b'print(something)\n' + ) + with self.assertRaises(SyntaxError): + tokenize.detect_encoding(self.get_readline(lines)) + + def test_second_non_utf8_coding_line(self): + lines = ( + b'#!/usr/bin/python\n', + b'#coding:iso-8859-15 \xa4\n', + b'print(something)\n' + ) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso-8859-15') + 
self.assertEqual(consumed_lines, list(lines[:2])) + + def test_second_utf8_coding_line_error(self): + lines = ( + b'#!/usr/bin/python\n', + b'#coding:ascii \xc3\xa4\n', + b'print(something)\n' + ) + with self.assertRaises(SyntaxError): + tokenize.detect_encoding(self.get_readline(lines)) + + def test_non_utf8_shebang(self): + lines = ( + b'#!/home/\xa4/bin/python\n', + b'#coding:iso-8859-15\n', + b'print(something)\n' + ) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso-8859-15') + self.assertEqual(consumed_lines, list(lines[:2])) + + def test_utf8_shebang_error(self): + lines = ( + b'#!/home/\xc3\xa4/bin/python\n', + b'#coding:ascii\n', + b'print(something)\n' + ) + with self.assertRaises(SyntaxError): + tokenize.detect_encoding(self.get_readline(lines)) def test_cookie_second_line_empty_first_line(self): lines = ( @@ -1194,13 +1559,77 @@ def test_cookie_second_line_empty_first_line(self): b'# vim: set fileencoding=iso8859-15 :\n', b"print('\xe2\x82\xac')\n" ) - encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'iso8859-15') expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n'] self.assertEqual(consumed_lines, expected) + def test_cookie_third_line(self): + lines = ( + b'#!/home/\xc3\xa4/bin/python\n', + b'# something\n', + b'# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8') + self.assertEqual(consumed_lines, list(lines[:2])) + + def test_double_coding_line(self): + # If the first line matches the second line is ignored. 
+ lines = ( + b'#coding:iso8859-15\n', + b'#coding:latin1\n', + b'print(something)\n' + ) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso8859-15') + self.assertEqual(consumed_lines, list(lines[:1])) + + def test_double_coding_same_line(self): + lines = ( + b'#coding:iso8859-15 coding:latin1\n', + b'print(something)\n' + ) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso8859-15') + self.assertEqual(consumed_lines, list(lines[:1])) + + def test_double_coding_utf8(self): + lines = ( + b'#coding:utf-8\n', + b'#coding:latin1\n', + b'print(something)\n' + ) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8') + self.assertEqual(consumed_lines, list(lines[:1])) + + def test_nul_in_first_coding_line(self): + lines = ( + b'#coding:iso8859-15\x00\n', + b'\n', + b'\n', + b'print(something)\n' + ) + with self.assertRaisesRegex(SyntaxError, + "source code cannot contain null bytes"): + tokenize.detect_encoding(self.get_readline(lines)) + + def test_nul_in_second_coding_line(self): + lines = ( + b'#!/usr/bin/python\n', + b'#coding:iso8859-15\x00\n', + b'\n', + b'print(something)\n' + ) + with self.assertRaisesRegex(SyntaxError, + "source code cannot contain null bytes"): + tokenize.detect_encoding(self.get_readline(lines)) + def test_latin1_normalization(self): - # See get_normal_name() in tokenizer.c. + # See get_normal_name() in Parser/tokenizer/helpers.c. 
encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix", "iso-8859-1-unix", "iso-latin-1-mac") for encoding in encodings: @@ -1211,21 +1640,20 @@ def test_latin1_normalization(self): b"print(things)\n", b"do_something += 4\n") rl = self.get_readline(lines) - found, consumed_lines = detect_encoding(rl) + found, consumed_lines = tokenize.detect_encoding(rl) self.assertEqual(found, "iso-8859-1") def test_syntaxerror_latin1(self): - # Issue 14629: need to raise SyntaxError if the first + # Issue 14629: need to raise TokenError if the first # line(s) have non-UTF-8 characters lines = ( b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S ) readline = self.get_readline(lines) - self.assertRaises(SyntaxError, detect_encoding, readline) - + self.assertRaises(SyntaxError, tokenize.detect_encoding, readline) def test_utf8_normalization(self): - # See get_normal_name() in tokenizer.c. + # See get_normal_name() in Parser/tokenizer/helpers.c. encodings = ("utf-8", "utf-8-mac", "utf-8-unix") for encoding in encodings: for rep in ("-", "_"): @@ -1234,39 +1662,40 @@ def test_utf8_normalization(self): b"# coding: " + enc.encode("ascii") + b"\n", b"1 + 3\n") rl = self.get_readline(lines) - found, consumed_lines = detect_encoding(rl) + found, consumed_lines = tokenize.detect_encoding(rl) self.assertEqual(found, "utf-8") def test_short_files(self): readline = self.get_readline((b'print(something)\n',)) - encoding, consumed_lines = detect_encoding(readline) + encoding, consumed_lines = tokenize.detect_encoding(readline) self.assertEqual(encoding, 'utf-8') self.assertEqual(consumed_lines, [b'print(something)\n']) - encoding, consumed_lines = detect_encoding(self.get_readline(())) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(())) self.assertEqual(encoding, 'utf-8') self.assertEqual(consumed_lines, []) readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',)) - encoding, consumed_lines = detect_encoding(readline) + encoding, 
consumed_lines = tokenize.detect_encoding(readline) self.assertEqual(encoding, 'utf-8-sig') self.assertEqual(consumed_lines, [b'print(something)\n']) readline = self.get_readline((b'\xef\xbb\xbf',)) - encoding, consumed_lines = detect_encoding(readline) + encoding, consumed_lines = tokenize.detect_encoding(readline) self.assertEqual(encoding, 'utf-8-sig') self.assertEqual(consumed_lines, []) readline = self.get_readline((b'# coding: bad\n',)) - self.assertRaises(SyntaxError, detect_encoding, readline) + self.assertRaises(SyntaxError, tokenize.detect_encoding, readline) def test_false_encoding(self): # Issue 18873: "Encoding" detected in non-comment lines readline = self.get_readline((b'print("#coding=fake")',)) - encoding, consumed_lines = detect_encoding(readline) + encoding, consumed_lines = tokenize.detect_encoding(readline) self.assertEqual(encoding, 'utf-8') self.assertEqual(consumed_lines, [b'print("#coding=fake")']) + @support.thread_unsafe def test_open(self): filename = os_helper.TESTFN + '.py' self.addCleanup(os_helper.unlink, filename) @@ -1276,14 +1705,14 @@ def test_open(self): with open(filename, 'w', encoding=encoding) as fp: print("# coding: %s" % encoding, file=fp) print("print('euro:\u20ac')", file=fp) - with tokenize_open(filename) as fp: + with tokenize.open(filename) as fp: self.assertEqual(fp.encoding, encoding) self.assertEqual(fp.mode, 'r') # test BOM (no coding cookie) with open(filename, 'w', encoding='utf-8-sig') as fp: print("print('euro:\u20ac')", file=fp) - with tokenize_open(filename) as fp: + with tokenize.open(filename) as fp: self.assertEqual(fp.encoding, 'utf-8-sig') self.assertEqual(fp.mode, 'r') @@ -1310,16 +1739,16 @@ def readline(self): ins = Bunk(lines, path) # Make sure lacking a name isn't an issue. 
del ins.name - detect_encoding(ins.readline) + tokenize.detect_encoding(ins.readline) with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)): ins = Bunk(lines, path) - detect_encoding(ins.readline) + tokenize.detect_encoding(ins.readline) def test_open_error(self): # Issue #23840: open() must close the binary file on error m = BytesIO(b'#coding:xxx') with mock.patch('tokenize._builtin_open', return_value=m): - self.assertRaises(SyntaxError, tokenize_open, 'foobar') + self.assertRaises(SyntaxError, tokenize.open, 'foobar') self.assertTrue(m.closed) @@ -1327,17 +1756,20 @@ class TestTokenize(TestCase): def test_tokenize(self): import tokenize as tokenize_module - encoding = object() + encoding = "utf-8" encoding_used = None def mock_detect_encoding(readline): return encoding, [b'first', b'second'] - def mock__tokenize(readline, encoding): + def mock__tokenize(readline, encoding, **kwargs): nonlocal encoding_used encoding_used = encoding out = [] while True: - next_line = readline() + try: + next_line = readline() + except StopIteration: + return out if next_line: out.append(next_line) continue @@ -1352,16 +1784,16 @@ def mock_readline(): return str(counter).encode() orig_detect_encoding = tokenize_module.detect_encoding - orig__tokenize = tokenize_module._tokenize + orig_c_token = tokenize_module._generate_tokens_from_c_tokenizer tokenize_module.detect_encoding = mock_detect_encoding - tokenize_module._tokenize = mock__tokenize + tokenize_module._generate_tokens_from_c_tokenizer = mock__tokenize try: - results = tokenize(mock_readline) - self.assertEqual(list(results), + results = tokenize.tokenize(mock_readline) + self.assertEqual(list(results)[1:], [b'first', b'second', b'1', b'2', b'3', b'4']) finally: tokenize_module.detect_encoding = orig_detect_encoding - tokenize_module._tokenize = orig__tokenize + tokenize_module._generate_tokens_from_c_tokenizer = orig_c_token self.assertEqual(encoding_used, encoding) @@ -1373,23 +1805,23 @@ def 
test_oneline_defs(self): buf = '\n'.join(buf) # Test that 500 consequent, one-line defs is OK - toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline)) + toks = list(tokenize.tokenize(BytesIO(buf.encode('utf-8')).readline)) self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER # [-2] is always NEWLINE def assertExactTypeEqual(self, opstr, *optypes): - tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline)) + tokens = list(tokenize.tokenize(BytesIO(opstr.encode('utf-8')).readline)) num_optypes = len(optypes) self.assertEqual(len(tokens), 3 + num_optypes) - self.assertEqual(tok_name[tokens[0].exact_type], - tok_name[ENCODING]) + self.assertEqual(tokenize.tok_name[tokens[0].exact_type], + tokenize.tok_name[tokenize.ENCODING]) for i in range(num_optypes): - self.assertEqual(tok_name[tokens[i + 1].exact_type], - tok_name[optypes[i]]) - self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type], - tok_name[token.NEWLINE]) - self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type], - tok_name[token.ENDMARKER]) + self.assertEqual(tokenize.tok_name[tokens[i + 1].exact_type], + tokenize.tok_name[optypes[i]]) + self.assertEqual(tokenize.tok_name[tokens[1 + num_optypes].exact_type], + tokenize.tok_name[token.NEWLINE]) + self.assertEqual(tokenize.tok_name[tokens[2 + num_optypes].exact_type], + tokenize.tok_name[token.ENDMARKER]) def test_exact_type(self): self.assertExactTypeEqual('()', token.LPAR, token.RPAR) @@ -1439,11 +1871,11 @@ def test_exact_type(self): self.assertExactTypeEqual('@=', token.ATEQUAL) self.assertExactTypeEqual('a**2+b**2==c**2', - NAME, token.DOUBLESTAR, NUMBER, + tokenize.NAME, token.DOUBLESTAR, tokenize.NUMBER, token.PLUS, - NAME, token.DOUBLESTAR, NUMBER, + tokenize.NAME, token.DOUBLESTAR, tokenize.NUMBER, token.EQEQUAL, - NAME, token.DOUBLESTAR, NUMBER) + tokenize.NAME, token.DOUBLESTAR, tokenize.NUMBER) self.assertExactTypeEqual('{1, 2, 3}', token.LBRACE, token.NUMBER, token.COMMA, @@ -1463,19 +1895,55 @@ def 
test_pathological_trailing_whitespace(self): def test_comment_at_the_end_of_the_source_without_newline(self): # See http://bugs.python.org/issue44667 source = 'b = 1\n\n#test' - expected_tokens = [token.NAME, token.EQUAL, token.NUMBER, token.NEWLINE, token.NL, token.COMMENT] + expected_tokens = [ + tokenize.TokenInfo(type=token.ENCODING, string='utf-8', start=(0, 0), end=(0, 0), line=''), + tokenize.TokenInfo(type=token.NAME, string='b', start=(1, 0), end=(1, 1), line='b = 1\n'), + tokenize.TokenInfo(type=token.OP, string='=', start=(1, 2), end=(1, 3), line='b = 1\n'), + tokenize.TokenInfo(type=token.NUMBER, string='1', start=(1, 4), end=(1, 5), line='b = 1\n'), + tokenize.TokenInfo(type=token.NEWLINE, string='\n', start=(1, 5), end=(1, 6), line='b = 1\n'), + tokenize.TokenInfo(type=token.NL, string='\n', start=(2, 0), end=(2, 1), line='\n'), + tokenize.TokenInfo(type=token.COMMENT, string='#test', start=(3, 0), end=(3, 5), line='#test'), + tokenize.TokenInfo(type=token.NL, string='', start=(3, 5), end=(3, 6), line='#test'), + tokenize.TokenInfo(type=token.ENDMARKER, string='', start=(4, 0), end=(4, 0), line='') + ] + + tokens = list(tokenize.tokenize(BytesIO(source.encode('utf-8')).readline)) + self.assertEqual(tokens, expected_tokens) + + @unittest.expectedFailure # TODO: RUSTPYTHON; Diff is 869 characters long. Set self.maxDiff to None to see it. 
+ def test_newline_and_space_at_the_end_of_the_source_without_newline(self): + # See https://github.com/python/cpython/issues/105435 + source = 'a\n ' + expected_tokens = [ + tokenize.TokenInfo(token.ENCODING, string='utf-8', start=(0, 0), end=(0, 0), line=''), + tokenize.TokenInfo(token.NAME, string='a', start=(1, 0), end=(1, 1), line='a\n'), + tokenize.TokenInfo(token.NEWLINE, string='\n', start=(1, 1), end=(1, 2), line='a\n'), + tokenize.TokenInfo(token.NL, string='', start=(2, 1), end=(2, 2), line=' '), + tokenize.TokenInfo(token.ENDMARKER, string='', start=(3, 0), end=(3, 0), line='') + ] + + tokens = list(tokenize.tokenize(BytesIO(source.encode('utf-8')).readline)) + self.assertEqual(tokens, expected_tokens) + + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: b'SyntaxError' not found in b'OSError: stream did not contain valid UTF-8\n' + def test_invalid_character_in_fstring_middle(self): + # See gh-103824 + script = b'''F""" + \xe5"""''' + + with os_helper.temp_dir() as temp_dir: + filename = os.path.join(temp_dir, "script.py") + with open(filename, 'wb') as file: + file.write(script) + rs, _ = run_python_until_end(filename) + self.assertIn(b"SyntaxError", rs.err) - tokens = list(tokenize(BytesIO(source.encode('utf-8')).readline)) - self.assertEqual(tok_name[tokens[0].exact_type], tok_name[ENCODING]) - for i in range(6): - self.assertEqual(tok_name[tokens[i + 1].exact_type], tok_name[expected_tokens[i]]) - self.assertEqual(tok_name[tokens[-1].exact_type], tok_name[token.ENDMARKER]) class UntokenizeTest(TestCase): def test_bad_input_order(self): # raise if previous row - u = Untokenizer() + u = tokenize.Untokenizer() u.prev_row = 2 u.prev_col = 2 with self.assertRaises(ValueError) as cm: @@ -1487,7 +1955,7 @@ def test_bad_input_order(self): def test_backslash_continuation(self): # The problem is that \ leaves no token - u = Untokenizer() + u = tokenize.Untokenizer() u.prev_row = 1 u.prev_col = 1 u.tokens = [] @@ -1499,17 +1967,33 @@ def 
test_backslash_continuation(self): TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n') def test_iter_compat(self): - u = Untokenizer() - token = (NAME, 'Hello') - tokens = [(ENCODING, 'utf-8'), token] + u = tokenize.Untokenizer() + token = (tokenize.NAME, 'Hello') + tokens = [(tokenize.ENCODING, 'utf-8'), token] u.compat(token, iter([])) self.assertEqual(u.tokens, ["Hello "]) - u = Untokenizer() + u = tokenize.Untokenizer() self.assertEqual(u.untokenize(iter([token])), 'Hello ') - u = Untokenizer() + u = tokenize.Untokenizer() self.assertEqual(u.untokenize(iter(tokens)), 'Hello ') self.assertEqual(u.encoding, 'utf-8') - self.assertEqual(untokenize(iter(tokens)), b'Hello ') + self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ') + + +def contains_ambiguous_backslash(source): + """Return `True` if the source contains a backslash on a + line by itself. For example: + + a = (1 + \\ + ) + + Code like this cannot be untokenized exactly. This is because + the tokenizer does not produce any tokens for the line containing + the backslash and so there is no way to know its indent. + """ + pattern = re.compile(br'\n\s*\\\r?\n') + return pattern.search(source) is not None class TestRoundtrip(TestCase): @@ -1522,6 +2006,9 @@ def check_roundtrip(self, f): tokenize.untokenize(), and the latter tokenized again to 2-tuples. The test fails if the 3 pair tokenizations do not match. + If the source code can be untokenized unambiguously, the + untokenized code must match the original code exactly. + When untokenize bugs are fixed, untokenize with 5-tuples should reproduce code that does not contain a backslash continuation following spaces. A proper test should test this. 
@@ -1531,21 +2018,38 @@ def check_roundtrip(self, f): code = f.encode('utf-8') else: code = f.read() - f.close() readline = iter(code.splitlines(keepends=True)).__next__ - tokens5 = list(tokenize(readline)) + tokens5 = list(tokenize.tokenize(readline)) tokens2 = [tok[:2] for tok in tokens5] # Reproduce tokens2 from pairs - bytes_from2 = untokenize(tokens2) + bytes_from2 = tokenize.untokenize(tokens2) readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__ - tokens2_from2 = [tok[:2] for tok in tokenize(readline2)] + tokens2_from2 = [tok[:2] for tok in tokenize.tokenize(readline2)] self.assertEqual(tokens2_from2, tokens2) # Reproduce tokens2 from 5-tuples - bytes_from5 = untokenize(tokens5) + bytes_from5 = tokenize.untokenize(tokens5) readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__ - tokens2_from5 = [tok[:2] for tok in tokenize(readline5)] + tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)] self.assertEqual(tokens2_from5, tokens2) + if not contains_ambiguous_backslash(code): + # The BOM does not produce a token so there is no way to preserve it. + code_without_bom = code.removeprefix(b'\xef\xbb\xbf') + readline = iter(code_without_bom.splitlines(keepends=True)).__next__ + untokenized_code = tokenize.untokenize(tokenize.tokenize(readline)) + self.assertEqual(code_without_bom, untokenized_code) + + def check_line_extraction(self, f): + if isinstance(f, str): + code = f.encode('utf-8') + else: + code = f.read() + readline = iter(code.splitlines(keepends=True)).__next__ + for tok in tokenize.tokenize(readline): + if tok.type in {tokenize.ENCODING, tokenize.ENDMARKER}: + continue + self.assertEqual(tok.string, tok.line[tok.start[1]: tok.end[1]]) + def test_roundtrip(self): # There are some standard formatting practices that are easy to get right. 
@@ -1561,7 +2065,7 @@ def test_roundtrip(self): self.check_roundtrip("if x == 1 : \n" " print(x)\n") - fn = support.findfile("tokenize_tests.txt") + fn = support.findfile("tokenize_tests.txt", subdir="tokenizedata") with open(fn, 'rb') as f: self.check_roundtrip(f) self.check_roundtrip("if x == 1:\n" @@ -1585,6 +2089,67 @@ def test_roundtrip(self): " print('Can not import' # comment2\n)" "else: print('Loaded')\n") + self.check_roundtrip("f'\\N{EXCLAMATION MARK}'") + self.check_roundtrip(r"f'\\N{SNAKE}'") + self.check_roundtrip(r"f'\\N{{SNAKE}}'") + self.check_roundtrip(r"f'\N{SNAKE}'") + self.check_roundtrip(r"f'\\\N{SNAKE}'") + self.check_roundtrip(r"f'\\\\\N{SNAKE}'") + self.check_roundtrip(r"f'\\\\\\\N{SNAKE}'") + + self.check_roundtrip(r"f'\\N{1}'") + self.check_roundtrip(r"f'\\\\N{2}'") + self.check_roundtrip(r"f'\\\\\\N{3}'") + self.check_roundtrip(r"f'\\\\\\\\N{4}'") + + self.check_roundtrip(r"f'\\N{{'") + self.check_roundtrip(r"f'\\\\N{{'") + self.check_roundtrip(r"f'\\\\\\N{{'") + self.check_roundtrip(r"f'\\\\\\\\N{{'") + + self.check_roundtrip(r"f'\n{{foo}}'") + self.check_roundtrip(r"f'\\n{{foo}}'") + self.check_roundtrip(r"f'\\\n{{foo}}'") + self.check_roundtrip(r"f'\\\\n{{foo}}'") + + self.check_roundtrip(r"f'\t{{foo}}'") + self.check_roundtrip(r"f'\\t{{foo}}'") + self.check_roundtrip(r"f'\\\t{{foo}}'") + self.check_roundtrip(r"f'\\\\t{{foo}}'") + + self.check_roundtrip(r"rf'\t{{foo}}'") + self.check_roundtrip(r"rf'\\t{{foo}}'") + self.check_roundtrip(r"rf'\\\t{{foo}}'") + self.check_roundtrip(r"rf'\\\\t{{foo}}'") + + self.check_roundtrip(r"rf'\{{foo}}'") + self.check_roundtrip(r"f'\\{{foo}}'") + self.check_roundtrip(r"rf'\\\{{foo}}'") + self.check_roundtrip(r"f'\\\\{{foo}}'") + cases = [ + """ +if 1: + "foo" +"bar" +""", + """ +if 1: + ("foo" + "bar") +""", + """ +if 1: + "foo" + "bar" +""" ] + for case in cases: + self.check_roundtrip(case) + + self.check_roundtrip(r"t'{ {}}'") + self.check_roundtrip(r"t'{f'{ {}}'}{ {}}'") + 
self.check_roundtrip(r"f'{t'{ {}}'}{ {}}'") + + def test_continuation(self): # Balancing continuation self.check_roundtrip("a = (3,4, \n" @@ -1611,26 +2176,15 @@ def test_string_concatenation(self): # Two string literals on the same line self.check_roundtrip("'' ''") - # TODO: RUSTPYTHON - @unittest.expectedFailure + @unittest.expectedFailure # TODO: RUSTPYTHON def test_random_files(self): # Test roundtrip on random python modules. # pass the '-ucpu' option to process the full directory. import glob, random - fn = support.findfile("tokenize_tests.txt") - tempdir = os.path.dirname(fn) or os.curdir + tempdir = os.path.dirname(__file__) or os.curdir testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py")) - # Tokenize is broken on test_pep3131.py because regular expressions are - # broken on the obscure unicode identifiers in it. *sigh* - # With roundtrip extended to test the 5-tuple mode of untokenize, - # 7 more testfiles fail. Remove them also until the failure is diagnosed. - - testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py")) - for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'): - testfiles.remove(os.path.join(tempdir, "test_%s.py") % f) - if not support.is_resource_enabled("cpu"): testfiles = random.sample(testfiles, 10) @@ -1640,12 +2194,13 @@ def test_random_files(self): with open(testfile, 'rb') as f: with self.subTest(file=testfile): self.check_roundtrip(f) + self.check_line_extraction(f) def roundtrip(self, code): if isinstance(code, str): code = code.encode('utf-8') - return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8') + return tokenize.untokenize(tokenize.tokenize(BytesIO(code).readline)).decode('utf-8') def test_indentation_semantics_retained(self): """ @@ -1658,5 +2213,1288 @@ def test_indentation_semantics_retained(self): self.check_roundtrip(code) +class InvalidPythonTests(TestCase): + @unittest.expectedFailure # TODO: RUSTPYTHON; Diff is 1046 characters long. 
Set self.maxDiff to None to see it. + def test_number_followed_by_name(self): + # See issue #gh-105549 + source = "2sin(x)" + expected_tokens = [ + tokenize.TokenInfo(type=token.NUMBER, string='2', start=(1, 0), end=(1, 1), line='2sin(x)'), + tokenize.TokenInfo(type=token.NAME, string='sin', start=(1, 1), end=(1, 4), line='2sin(x)'), + tokenize.TokenInfo(type=token.OP, string='(', start=(1, 4), end=(1, 5), line='2sin(x)'), + tokenize.TokenInfo(type=token.NAME, string='x', start=(1, 5), end=(1, 6), line='2sin(x)'), + tokenize.TokenInfo(type=token.OP, string=')', start=(1, 6), end=(1, 7), line='2sin(x)'), + tokenize.TokenInfo(type=token.NEWLINE, string='', start=(1, 7), end=(1, 8), line='2sin(x)'), + tokenize.TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='') + ] + + tokens = list(tokenize.generate_tokens(StringIO(source).readline)) + self.assertEqual(tokens, expected_tokens) + + @unittest.expectedFailure # TODO: RUSTPYTHON; Diff is 855 characters long. Set self.maxDiff to None to see it. + def test_number_starting_with_zero(self): + source = "01234" + expected_tokens = [ + tokenize.TokenInfo(type=token.NUMBER, string='01234', start=(1, 0), end=(1, 5), line='01234'), + tokenize.TokenInfo(type=token.NEWLINE, string='', start=(1, 5), end=(1, 6), line='01234'), + tokenize.TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='') + ] + + tokens = list(tokenize.generate_tokens(StringIO(source).readline)) + self.assertEqual(tokens, expected_tokens) + +class CTokenizeTest(TestCase): + def check_tokenize(self, s, expected): + # Format the tokens in s in a table format. + # The ENDMARKER and final NEWLINE are omitted. 
+ f = StringIO(s) + with self.subTest(source=s): + result = stringify_tokens_from_source( + tokenize._generate_tokens_from_c_tokenizer(f.readline), s + ) + self.assertEqual(result, expected.rstrip().splitlines()) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_encoding(self): + def readline(encoding): + yield "1+1".encode(encoding) + + expected = [ + tokenize.TokenInfo(type=tokenize.NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1'), + tokenize.TokenInfo(type=tokenize.OP, string='+', start=(1, 1), end=(1, 2), line='1+1'), + tokenize.TokenInfo(type=tokenize.NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1'), + tokenize.TokenInfo(type=tokenize.NEWLINE, string='', start=(1, 3), end=(1, 4), line='1+1'), + tokenize.TokenInfo(type=tokenize.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='') + ] + for encoding in ["utf-8", "latin-1", "utf-16"]: + with self.subTest(encoding=encoding): + tokens = list(tokenize._generate_tokens_from_c_tokenizer( + readline(encoding).__next__, + extra_tokens=True, + encoding=encoding, + )) + self.assertEqual(tokens, expected) + + def test_int(self): + + self.check_tokenize('0xff <= 255', """\ + NUMBER '0xff' (1, 0) (1, 4) + LESSEQUAL '<=' (1, 5) (1, 7) + NUMBER '255' (1, 8) (1, 11) + """) + + self.check_tokenize('0b10 <= 255', """\ + NUMBER '0b10' (1, 0) (1, 4) + LESSEQUAL '<=' (1, 5) (1, 7) + NUMBER '255' (1, 8) (1, 11) + """) + + self.check_tokenize('0o123 <= 0O123', """\ + NUMBER '0o123' (1, 0) (1, 5) + LESSEQUAL '<=' (1, 6) (1, 8) + NUMBER '0O123' (1, 9) (1, 14) + """) + + self.check_tokenize('1234567 > ~0x15', """\ + NUMBER '1234567' (1, 0) (1, 7) + GREATER '>' (1, 8) (1, 9) + TILDE '~' (1, 10) (1, 11) + NUMBER '0x15' (1, 11) (1, 15) + """) + + self.check_tokenize('2134568 != 1231515', """\ + NUMBER '2134568' (1, 0) (1, 7) + NOTEQUAL '!=' (1, 8) (1, 10) + NUMBER '1231515' (1, 11) (1, 18) + """) + + self.check_tokenize('(-124561-1) & 200000000', """\ + LPAR '(' (1, 0) (1, 1) + MINUS '-' (1, 1) (1, 2) + 
NUMBER '124561' (1, 2) (1, 8) + MINUS '-' (1, 8) (1, 9) + NUMBER '1' (1, 9) (1, 10) + RPAR ')' (1, 10) (1, 11) + AMPER '&' (1, 12) (1, 13) + NUMBER '200000000' (1, 14) (1, 23) + """) + + self.check_tokenize('0xdeadbeef != -1', """\ + NUMBER '0xdeadbeef' (1, 0) (1, 10) + NOTEQUAL '!=' (1, 11) (1, 13) + MINUS '-' (1, 14) (1, 15) + NUMBER '1' (1, 15) (1, 16) + """) + + self.check_tokenize('0xdeadc0de & 12345', """\ + NUMBER '0xdeadc0de' (1, 0) (1, 10) + AMPER '&' (1, 11) (1, 12) + NUMBER '12345' (1, 13) (1, 18) + """) + + self.check_tokenize('0xFF & 0x15 | 1234', """\ + NUMBER '0xFF' (1, 0) (1, 4) + AMPER '&' (1, 5) (1, 6) + NUMBER '0x15' (1, 7) (1, 11) + VBAR '|' (1, 12) (1, 13) + NUMBER '1234' (1, 14) (1, 18) + """) + + def test_float(self): + + self.check_tokenize('x = 3.14159', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '3.14159' (1, 4) (1, 11) + """) + + self.check_tokenize('x = 314159.', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '314159.' (1, 4) (1, 11) + """) + + self.check_tokenize('x = .314159', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '.314159' (1, 4) (1, 11) + """) + + self.check_tokenize('x = 3e14159', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '3e14159' (1, 4) (1, 11) + """) + + self.check_tokenize('x = 3E123', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '3E123' (1, 4) (1, 9) + """) + + self.check_tokenize('x+y = 3e-1230', """\ + NAME 'x' (1, 0) (1, 1) + PLUS '+' (1, 1) (1, 2) + NAME 'y' (1, 2) (1, 3) + EQUAL '=' (1, 4) (1, 5) + NUMBER '3e-1230' (1, 6) (1, 13) + """) + + self.check_tokenize('x = 3.14e159', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '3.14e159' (1, 4) (1, 12) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_string(self): + + self.check_tokenize('x = \'\'; y = ""', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING "''" (1, 4) (1, 6) + SEMI ';' (1, 6) (1, 7) + NAME 'y' (1, 8) 
(1, 9) + EQUAL '=' (1, 10) (1, 11) + STRING '""' (1, 12) (1, 14) + """) + + self.check_tokenize('x = \'"\'; y = "\'"', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING '\\'"\\'' (1, 4) (1, 7) + SEMI ';' (1, 7) (1, 8) + NAME 'y' (1, 9) (1, 10) + EQUAL '=' (1, 11) (1, 12) + STRING '"\\'"' (1, 13) (1, 16) + """) + + self.check_tokenize('x = "doesn\'t "shrink", does it"', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING '"doesn\\'t "' (1, 4) (1, 14) + NAME 'shrink' (1, 14) (1, 20) + STRING '", does it"' (1, 20) (1, 31) + """) + + self.check_tokenize("x = 'abc' + 'ABC'", """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING "'abc'" (1, 4) (1, 9) + PLUS '+' (1, 10) (1, 11) + STRING "'ABC'" (1, 12) (1, 17) + """) + + self.check_tokenize('y = "ABC" + "ABC"', """\ + NAME 'y' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING '"ABC"' (1, 4) (1, 9) + PLUS '+' (1, 10) (1, 11) + STRING '"ABC"' (1, 12) (1, 17) + """) + + self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING "r'abc'" (1, 4) (1, 10) + PLUS '+' (1, 11) (1, 12) + STRING "r'ABC'" (1, 13) (1, 19) + PLUS '+' (1, 20) (1, 21) + STRING "R'ABC'" (1, 22) (1, 28) + PLUS '+' (1, 29) (1, 30) + STRING "R'ABC'" (1, 31) (1, 37) + """) + + self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\ + NAME 'y' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + STRING 'r"abc"' (1, 4) (1, 10) + PLUS '+' (1, 11) (1, 12) + STRING 'r"ABC"' (1, 13) (1, 19) + PLUS '+' (1, 20) (1, 21) + STRING 'R"ABC"' (1, 22) (1, 28) + PLUS '+' (1, 29) (1, 30) + STRING 'R"ABC"' (1, 31) (1, 37) + """) + + self.check_tokenize("u'abc' + U'abc'", """\ + STRING "u'abc'" (1, 0) (1, 6) + PLUS '+' (1, 7) (1, 8) + STRING "U'abc'" (1, 9) (1, 15) + """) + + self.check_tokenize('u"abc" + U"abc"', """\ + STRING 'u"abc"' (1, 0) (1, 6) + PLUS '+' (1, 7) (1, 8) + STRING 'U"abc"' (1, 9) (1, 15) + """) + + self.check_tokenize("b'abc' + B'abc'", """\ + STRING 
"b'abc'" (1, 0) (1, 6) + PLUS '+' (1, 7) (1, 8) + STRING "B'abc'" (1, 9) (1, 15) + """) + + self.check_tokenize('b"abc" + B"abc"', """\ + STRING 'b"abc"' (1, 0) (1, 6) + PLUS '+' (1, 7) (1, 8) + STRING 'B"abc"' (1, 9) (1, 15) + """) + + self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\ + STRING "br'abc'" (1, 0) (1, 7) + PLUS '+' (1, 8) (1, 9) + STRING "bR'abc'" (1, 10) (1, 17) + PLUS '+' (1, 18) (1, 19) + STRING "Br'abc'" (1, 20) (1, 27) + PLUS '+' (1, 28) (1, 29) + STRING "BR'abc'" (1, 30) (1, 37) + """) + + self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\ + STRING 'br"abc"' (1, 0) (1, 7) + PLUS '+' (1, 8) (1, 9) + STRING 'bR"abc"' (1, 10) (1, 17) + PLUS '+' (1, 18) (1, 19) + STRING 'Br"abc"' (1, 20) (1, 27) + PLUS '+' (1, 28) (1, 29) + STRING 'BR"abc"' (1, 30) (1, 37) + """) + + self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\ + STRING "rb'abc'" (1, 0) (1, 7) + PLUS '+' (1, 8) (1, 9) + STRING "rB'abc'" (1, 10) (1, 17) + PLUS '+' (1, 18) (1, 19) + STRING "Rb'abc'" (1, 20) (1, 27) + PLUS '+' (1, 28) (1, 29) + STRING "RB'abc'" (1, 30) (1, 37) + """) + + self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\ + STRING 'rb"abc"' (1, 0) (1, 7) + PLUS '+' (1, 8) (1, 9) + STRING 'rB"abc"' (1, 10) (1, 17) + PLUS '+' (1, 18) (1, 19) + STRING 'Rb"abc"' (1, 20) (1, 27) + PLUS '+' (1, 28) (1, 29) + STRING 'RB"abc"' (1, 30) (1, 37) + """) + + self.check_tokenize('"a\\\nde\\\nfg"', """\ + STRING '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3) + """) + + self.check_tokenize('u"a\\\nde"', """\ + STRING 'u"a\\\\\\nde"\' (1, 0) (2, 3) + """) + + self.check_tokenize('rb"a\\\nd"', """\ + STRING 'rb"a\\\\\\nd"\' (1, 0) (2, 2) + """) + + self.check_tokenize(r'"""a\ +b"""', """\ + STRING '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4) + """) + self.check_tokenize(r'u"""a\ +b"""', """\ + STRING 'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4) + """) + self.check_tokenize(r'rb"""a\ +b\ +c"""', """\ + STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4) + """) + + 
self.check_tokenize(r'"hola\\\r\ndfgf"', """\ + STRING \'"hola\\\\\\\\\\\\r\\\\ndfgf"\' (1, 0) (1, 16) + """) + + self.check_tokenize('f"abc"', """\ + FSTRING_START 'f"' (1, 0) (1, 2) + FSTRING_MIDDLE 'abc' (1, 2) (1, 5) + FSTRING_END '"' (1, 5) (1, 6) + """) + + self.check_tokenize('fR"a{b}c"', """\ + FSTRING_START 'fR"' (1, 0) (1, 3) + FSTRING_MIDDLE 'a' (1, 3) (1, 4) + LBRACE '{' (1, 4) (1, 5) + NAME 'b' (1, 5) (1, 6) + RBRACE '}' (1, 6) (1, 7) + FSTRING_MIDDLE 'c' (1, 7) (1, 8) + FSTRING_END '"' (1, 8) (1, 9) + """) + + self.check_tokenize('f"""abc"""', """\ + FSTRING_START 'f\"""' (1, 0) (1, 4) + FSTRING_MIDDLE 'abc' (1, 4) (1, 7) + FSTRING_END '\"""' (1, 7) (1, 10) + """) + + self.check_tokenize(r'f"abc\ +def"', """\ + FSTRING_START \'f"\' (1, 0) (1, 2) + FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 2) (2, 3) + FSTRING_END '"' (2, 3) (2, 4) + """) + + self.check_tokenize('''\ +f"{ +a}"''', """\ + FSTRING_START 'f"' (1, 0) (1, 2) + LBRACE '{' (1, 2) (1, 3) + NAME 'a' (2, 0) (2, 1) + RBRACE '}' (2, 1) (2, 2) + FSTRING_END '"' (2, 2) (2, 3) + """) + + self.check_tokenize(r'Rf"abc\ +def"', """\ + FSTRING_START 'Rf"' (1, 0) (1, 3) + FSTRING_MIDDLE 'abc\\\\\\ndef' (1, 3) (2, 3) + FSTRING_END '"' (2, 3) (2, 4) + """) + + self.check_tokenize(r'f"hola\\\r\ndfgf"', """\ + FSTRING_START \'f"\' (1, 0) (1, 2) + FSTRING_MIDDLE 'hola\\\\\\\\\\\\r\\\\ndfgf' (1, 2) (1, 16) + FSTRING_END \'"\' (1, 16) (1, 17) + """) + + self.check_tokenize("""\ +f'''__{ + x:a +}__'''""", """\ + FSTRING_START "f'''" (1, 0) (1, 4) + FSTRING_MIDDLE '__' (1, 4) (1, 6) + LBRACE '{' (1, 6) (1, 7) + NAME 'x' (2, 4) (2, 5) + COLON ':' (2, 5) (2, 6) + FSTRING_MIDDLE 'a\\n' (2, 6) (3, 0) + RBRACE '}' (3, 0) (3, 1) + FSTRING_MIDDLE '__' (3, 1) (3, 3) + FSTRING_END "'''" (3, 3) (3, 6) + """) + + self.check_tokenize("""\ +f'''__{ + x:a + b + c + d +}__'''""", """\ + FSTRING_START "f'''" (1, 0) (1, 4) + FSTRING_MIDDLE '__' (1, 4) (1, 6) + LBRACE '{' (1, 6) (1, 7) + NAME 'x' (2, 4) (2, 5) + COLON ':' (2, 5) (2, 6) + 
FSTRING_MIDDLE 'a\\n b\\n c\\n d\\n' (2, 6) (6, 0) + RBRACE '}' (6, 0) (6, 1) + FSTRING_MIDDLE '__' (6, 1) (6, 3) + FSTRING_END "'''" (6, 3) (6, 6) + """) + + def test_function(self): + + self.check_tokenize('def d22(a, b, c=2, d=2, *k): pass', """\ + NAME 'def' (1, 0) (1, 3) + NAME 'd22' (1, 4) (1, 7) + LPAR '(' (1, 7) (1, 8) + NAME 'a' (1, 8) (1, 9) + COMMA ',' (1, 9) (1, 10) + NAME 'b' (1, 11) (1, 12) + COMMA ',' (1, 12) (1, 13) + NAME 'c' (1, 14) (1, 15) + EQUAL '=' (1, 15) (1, 16) + NUMBER '2' (1, 16) (1, 17) + COMMA ',' (1, 17) (1, 18) + NAME 'd' (1, 19) (1, 20) + EQUAL '=' (1, 20) (1, 21) + NUMBER '2' (1, 21) (1, 22) + COMMA ',' (1, 22) (1, 23) + STAR '*' (1, 24) (1, 25) + NAME 'k' (1, 25) (1, 26) + RPAR ')' (1, 26) (1, 27) + COLON ':' (1, 27) (1, 28) + NAME 'pass' (1, 29) (1, 33) + """) + + self.check_tokenize('def d01v_(a=1, *k, **w): pass', """\ + NAME 'def' (1, 0) (1, 3) + NAME 'd01v_' (1, 4) (1, 9) + LPAR '(' (1, 9) (1, 10) + NAME 'a' (1, 10) (1, 11) + EQUAL '=' (1, 11) (1, 12) + NUMBER '1' (1, 12) (1, 13) + COMMA ',' (1, 13) (1, 14) + STAR '*' (1, 15) (1, 16) + NAME 'k' (1, 16) (1, 17) + COMMA ',' (1, 17) (1, 18) + DOUBLESTAR '**' (1, 19) (1, 21) + NAME 'w' (1, 21) (1, 22) + RPAR ')' (1, 22) (1, 23) + COLON ':' (1, 23) (1, 24) + NAME 'pass' (1, 25) (1, 29) + """) + + self.check_tokenize('def d23(a: str, b: int=3) -> int: pass', """\ + NAME 'def' (1, 0) (1, 3) + NAME 'd23' (1, 4) (1, 7) + LPAR '(' (1, 7) (1, 8) + NAME 'a' (1, 8) (1, 9) + COLON ':' (1, 9) (1, 10) + NAME 'str' (1, 11) (1, 14) + COMMA ',' (1, 14) (1, 15) + NAME 'b' (1, 16) (1, 17) + COLON ':' (1, 17) (1, 18) + NAME 'int' (1, 19) (1, 22) + EQUAL '=' (1, 22) (1, 23) + NUMBER '3' (1, 23) (1, 24) + RPAR ')' (1, 24) (1, 25) + RARROW '->' (1, 26) (1, 28) + NAME 'int' (1, 29) (1, 32) + COLON ':' (1, 32) (1, 33) + NAME 'pass' (1, 34) (1, 38) + """) + + def test_comparison(self): + + self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != " + "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", 
"""\ + NAME 'if' (1, 0) (1, 2) + NUMBER '1' (1, 3) (1, 4) + LESS '<' (1, 5) (1, 6) + NUMBER '1' (1, 7) (1, 8) + GREATER '>' (1, 9) (1, 10) + NUMBER '1' (1, 11) (1, 12) + EQEQUAL '==' (1, 13) (1, 15) + NUMBER '1' (1, 16) (1, 17) + GREATEREQUAL '>=' (1, 18) (1, 20) + NUMBER '5' (1, 21) (1, 22) + LESSEQUAL '<=' (1, 23) (1, 25) + NUMBER '0x15' (1, 26) (1, 30) + LESSEQUAL '<=' (1, 31) (1, 33) + NUMBER '0x12' (1, 34) (1, 38) + NOTEQUAL '!=' (1, 39) (1, 41) + NUMBER '1' (1, 42) (1, 43) + NAME 'and' (1, 44) (1, 47) + NUMBER '5' (1, 48) (1, 49) + NAME 'in' (1, 50) (1, 52) + NUMBER '1' (1, 53) (1, 54) + NAME 'not' (1, 55) (1, 58) + NAME 'in' (1, 59) (1, 61) + NUMBER '1' (1, 62) (1, 63) + NAME 'is' (1, 64) (1, 66) + NUMBER '1' (1, 67) (1, 68) + NAME 'or' (1, 69) (1, 71) + NUMBER '5' (1, 72) (1, 73) + NAME 'is' (1, 74) (1, 76) + NAME 'not' (1, 77) (1, 80) + NUMBER '1' (1, 81) (1, 82) + COLON ':' (1, 82) (1, 83) + NAME 'pass' (1, 84) (1, 88) + """) + + def test_additive(self): + + self.check_tokenize('x = 1 - y + 15 - 1 + 0x124 + z + a[5]', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '1' (1, 4) (1, 5) + MINUS '-' (1, 6) (1, 7) + NAME 'y' (1, 8) (1, 9) + PLUS '+' (1, 10) (1, 11) + NUMBER '15' (1, 12) (1, 14) + MINUS '-' (1, 15) (1, 16) + NUMBER '1' (1, 17) (1, 18) + PLUS '+' (1, 19) (1, 20) + NUMBER '0x124' (1, 21) (1, 26) + PLUS '+' (1, 27) (1, 28) + NAME 'z' (1, 29) (1, 30) + PLUS '+' (1, 31) (1, 32) + NAME 'a' (1, 33) (1, 34) + LSQB '[' (1, 34) (1, 35) + NUMBER '5' (1, 35) (1, 36) + RSQB ']' (1, 36) (1, 37) + """) + + def test_multiplicative(self): + + self.check_tokenize('x = 1//1*1/5*12%0x12@42', """\ + NAME 'x' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + NUMBER '1' (1, 4) (1, 5) + DOUBLESLASH '//' (1, 5) (1, 7) + NUMBER '1' (1, 7) (1, 8) + STAR '*' (1, 8) (1, 9) + NUMBER '1' (1, 9) (1, 10) + SLASH '/' (1, 10) (1, 11) + NUMBER '5' (1, 11) (1, 12) + STAR '*' (1, 12) (1, 13) + NUMBER '12' (1, 13) (1, 15) + PERCENT '%' (1, 15) (1, 16) + NUMBER '0x12' (1, 16) 
(1, 20) + AT '@' (1, 20) (1, 21) + NUMBER '42' (1, 21) (1, 23) + """) + + def test_unary(self): + + self.check_tokenize('~1 ^ 1 & 1 |1 ^ -1', """\ + TILDE '~' (1, 0) (1, 1) + NUMBER '1' (1, 1) (1, 2) + CIRCUMFLEX '^' (1, 3) (1, 4) + NUMBER '1' (1, 5) (1, 6) + AMPER '&' (1, 7) (1, 8) + NUMBER '1' (1, 9) (1, 10) + VBAR '|' (1, 11) (1, 12) + NUMBER '1' (1, 12) (1, 13) + CIRCUMFLEX '^' (1, 14) (1, 15) + MINUS '-' (1, 16) (1, 17) + NUMBER '1' (1, 17) (1, 18) + """) + + self.check_tokenize('-1*1/1+1*1//1 - ---1**1', """\ + MINUS '-' (1, 0) (1, 1) + NUMBER '1' (1, 1) (1, 2) + STAR '*' (1, 2) (1, 3) + NUMBER '1' (1, 3) (1, 4) + SLASH '/' (1, 4) (1, 5) + NUMBER '1' (1, 5) (1, 6) + PLUS '+' (1, 6) (1, 7) + NUMBER '1' (1, 7) (1, 8) + STAR '*' (1, 8) (1, 9) + NUMBER '1' (1, 9) (1, 10) + DOUBLESLASH '//' (1, 10) (1, 12) + NUMBER '1' (1, 12) (1, 13) + MINUS '-' (1, 14) (1, 15) + MINUS '-' (1, 16) (1, 17) + MINUS '-' (1, 17) (1, 18) + MINUS '-' (1, 18) (1, 19) + NUMBER '1' (1, 19) (1, 20) + DOUBLESTAR '**' (1, 20) (1, 22) + NUMBER '1' (1, 22) (1, 23) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_selector(self): + + self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\ + NAME 'import' (1, 0) (1, 6) + NAME 'sys' (1, 7) (1, 10) + COMMA ',' (1, 10) (1, 11) + NAME 'time' (1, 12) (1, 16) + NEWLINE '' (1, 16) (1, 16) + NAME 'x' (2, 0) (2, 1) + EQUAL '=' (2, 2) (2, 3) + NAME 'sys' (2, 4) (2, 7) + DOT '.' (2, 7) (2, 8) + NAME 'modules' (2, 8) (2, 15) + LSQB '[' (2, 15) (2, 16) + STRING "'time'" (2, 16) (2, 22) + RSQB ']' (2, 22) (2, 23) + DOT '.' 
(2, 23) (2, 24) + NAME 'time' (2, 24) (2, 28) + LPAR '(' (2, 28) (2, 29) + RPAR ')' (2, 29) (2, 30) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_method(self): + + self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\ + AT '@' (1, 0) (1, 1) + NAME 'staticmethod' (1, 1) (1, 13) + NEWLINE '' (1, 13) (1, 13) + NAME 'def' (2, 0) (2, 3) + NAME 'foo' (2, 4) (2, 7) + LPAR '(' (2, 7) (2, 8) + NAME 'x' (2, 8) (2, 9) + COMMA ',' (2, 9) (2, 10) + NAME 'y' (2, 10) (2, 11) + RPAR ')' (2, 11) (2, 12) + COLON ':' (2, 12) (2, 13) + NAME 'pass' (2, 14) (2, 18) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_tabs(self): + + self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\ + AT '@' (1, 0) (1, 1) + NAME 'staticmethod' (1, 1) (1, 13) + NEWLINE '' (1, 13) (1, 13) + NAME 'def' (2, 0) (2, 3) + NAME 'foo' (2, 4) (2, 7) + LPAR '(' (2, 7) (2, 8) + NAME 'x' (2, 8) (2, 9) + COMMA ',' (2, 9) (2, 10) + NAME 'y' (2, 10) (2, 11) + RPAR ')' (2, 11) (2, 12) + COLON ':' (2, 12) (2, 13) + NAME 'pass' (2, 14) (2, 18) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_async(self): + + self.check_tokenize('async = 1', """\ + NAME 'async' (1, 0) (1, 5) + EQUAL '=' (1, 6) (1, 7) + NUMBER '1' (1, 8) (1, 9) + """) + + self.check_tokenize('a = (async = 1)', """\ + NAME 'a' (1, 0) (1, 1) + EQUAL '=' (1, 2) (1, 3) + LPAR '(' (1, 4) (1, 5) + NAME 'async' (1, 5) (1, 10) + EQUAL '=' (1, 11) (1, 12) + NUMBER '1' (1, 13) (1, 14) + RPAR ')' (1, 14) (1, 15) + """) + + self.check_tokenize('async()', """\ + NAME 'async' (1, 0) (1, 5) + LPAR '(' (1, 5) (1, 6) + RPAR ')' (1, 6) (1, 7) + """) + + self.check_tokenize('class async(Bar):pass', """\ + NAME 'class' (1, 0) (1, 5) + NAME 'async' (1, 6) (1, 11) + LPAR '(' (1, 11) (1, 12) + NAME 'Bar' (1, 12) (1, 15) + RPAR ')' (1, 15) (1, 16) + COLON ':' (1, 16) (1, 17) + NAME 'pass' (1, 17) (1, 21) + """) + + self.check_tokenize('class async:pass', """\ + NAME 'class' (1, 0) (1, 5) + NAME 'async' (1, 6) 
(1, 11) + COLON ':' (1, 11) (1, 12) + NAME 'pass' (1, 12) (1, 16) + """) + + self.check_tokenize('await = 1', """\ + NAME 'await' (1, 0) (1, 5) + EQUAL '=' (1, 6) (1, 7) + NUMBER '1' (1, 8) (1, 9) + """) + + self.check_tokenize('foo.async', """\ + NAME 'foo' (1, 0) (1, 3) + DOT '.' (1, 3) (1, 4) + NAME 'async' (1, 4) (1, 9) + """) + + self.check_tokenize('async for a in b: pass', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'for' (1, 6) (1, 9) + NAME 'a' (1, 10) (1, 11) + NAME 'in' (1, 12) (1, 14) + NAME 'b' (1, 15) (1, 16) + COLON ':' (1, 16) (1, 17) + NAME 'pass' (1, 18) (1, 22) + """) + + self.check_tokenize('async with a as b: pass', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'with' (1, 6) (1, 10) + NAME 'a' (1, 11) (1, 12) + NAME 'as' (1, 13) (1, 15) + NAME 'b' (1, 16) (1, 17) + COLON ':' (1, 17) (1, 18) + NAME 'pass' (1, 19) (1, 23) + """) + + self.check_tokenize('async.foo', """\ + NAME 'async' (1, 0) (1, 5) + DOT '.' (1, 5) (1, 6) + NAME 'foo' (1, 6) (1, 9) + """) + + self.check_tokenize('async', """\ + NAME 'async' (1, 0) (1, 5) + """) + + self.check_tokenize('async\n#comment\nawait', """\ + NAME 'async' (1, 0) (1, 5) + NEWLINE '' (1, 5) (1, 5) + NAME 'await' (3, 0) (3, 5) + """) + + self.check_tokenize('async\n...\nawait', """\ + NAME 'async' (1, 0) (1, 5) + NEWLINE '' (1, 5) (1, 5) + ELLIPSIS '...' (2, 0) (2, 3) + NEWLINE '' (2, 3) (2, 3) + NAME 'await' (3, 0) (3, 5) + """) + + self.check_tokenize('async\nawait', """\ + NAME 'async' (1, 0) (1, 5) + NEWLINE '' (1, 5) (1, 5) + NAME 'await' (2, 0) (2, 5) + """) + + self.check_tokenize('foo.async + 1', """\ + NAME 'foo' (1, 0) (1, 3) + DOT '.' 
(1, 3) (1, 4) + NAME 'async' (1, 4) (1, 9) + PLUS '+' (1, 10) (1, 11) + NUMBER '1' (1, 12) (1, 13) + """) + + self.check_tokenize('async def foo(): pass', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + LPAR '(' (1, 13) (1, 14) + RPAR ')' (1, 14) (1, 15) + COLON ':' (1, 15) (1, 16) + NAME 'pass' (1, 17) (1, 21) + """) + + self.check_tokenize('''\ +async def foo(): + def foo(await): + await = 1 + if 1: + await +async += 1 +''', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + LPAR '(' (1, 13) (1, 14) + RPAR ')' (1, 14) (1, 15) + COLON ':' (1, 15) (1, 16) + NEWLINE '' (1, 16) (1, 16) + INDENT '' (2, -1) (2, -1) + NAME 'def' (2, 2) (2, 5) + NAME 'foo' (2, 6) (2, 9) + LPAR '(' (2, 9) (2, 10) + NAME 'await' (2, 10) (2, 15) + RPAR ')' (2, 15) (2, 16) + COLON ':' (2, 16) (2, 17) + NEWLINE '' (2, 17) (2, 17) + INDENT '' (3, -1) (3, -1) + NAME 'await' (3, 4) (3, 9) + EQUAL '=' (3, 10) (3, 11) + NUMBER '1' (3, 12) (3, 13) + NEWLINE '' (3, 13) (3, 13) + DEDENT '' (4, -1) (4, -1) + NAME 'if' (4, 2) (4, 4) + NUMBER '1' (4, 5) (4, 6) + COLON ':' (4, 6) (4, 7) + NEWLINE '' (4, 7) (4, 7) + INDENT '' (5, -1) (5, -1) + NAME 'await' (5, 4) (5, 9) + NEWLINE '' (5, 9) (5, 9) + DEDENT '' (6, -1) (6, -1) + DEDENT '' (6, -1) (6, -1) + NAME 'async' (6, 0) (6, 5) + PLUSEQUAL '+=' (6, 6) (6, 8) + NUMBER '1' (6, 9) (6, 10) + NEWLINE '' (6, 10) (6, 10) + """) + + self.check_tokenize('async def foo():\n async for i in 1: pass', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + LPAR '(' (1, 13) (1, 14) + RPAR ')' (1, 14) (1, 15) + COLON ':' (1, 15) (1, 16) + NEWLINE '' (1, 16) (1, 16) + INDENT '' (2, -1) (2, -1) + NAME 'async' (2, 2) (2, 7) + NAME 'for' (2, 8) (2, 11) + NAME 'i' (2, 12) (2, 13) + NAME 'in' (2, 14) (2, 16) + NUMBER '1' (2, 17) (2, 18) + COLON ':' (2, 18) (2, 19) + NAME 'pass' (2, 20) (2, 24) + DEDENT '' (2, -1) (2, -1) + """) + + 
self.check_tokenize('async def foo(async): await', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'foo' (1, 10) (1, 13) + LPAR '(' (1, 13) (1, 14) + NAME 'async' (1, 14) (1, 19) + RPAR ')' (1, 19) (1, 20) + COLON ':' (1, 20) (1, 21) + NAME 'await' (1, 22) (1, 27) + """) + + self.check_tokenize('''\ +def f(): + + def baz(): pass + async def bar(): pass + + await = 2''', """\ + NAME 'def' (1, 0) (1, 3) + NAME 'f' (1, 4) (1, 5) + LPAR '(' (1, 5) (1, 6) + RPAR ')' (1, 6) (1, 7) + COLON ':' (1, 7) (1, 8) + NEWLINE '' (1, 8) (1, 8) + INDENT '' (3, -1) (3, -1) + NAME 'def' (3, 2) (3, 5) + NAME 'baz' (3, 6) (3, 9) + LPAR '(' (3, 9) (3, 10) + RPAR ')' (3, 10) (3, 11) + COLON ':' (3, 11) (3, 12) + NAME 'pass' (3, 13) (3, 17) + NEWLINE '' (3, 17) (3, 17) + NAME 'async' (4, 2) (4, 7) + NAME 'def' (4, 8) (4, 11) + NAME 'bar' (4, 12) (4, 15) + LPAR '(' (4, 15) (4, 16) + RPAR ')' (4, 16) (4, 17) + COLON ':' (4, 17) (4, 18) + NAME 'pass' (4, 19) (4, 23) + NEWLINE '' (4, 23) (4, 23) + NAME 'await' (6, 2) (6, 7) + EQUAL '=' (6, 8) (6, 9) + NUMBER '2' (6, 10) (6, 11) + DEDENT '' (6, -1) (6, -1) + """) + + self.check_tokenize('''\ +async def f(): + + def baz(): pass + async def bar(): pass + + await = 2''', """\ + NAME 'async' (1, 0) (1, 5) + NAME 'def' (1, 6) (1, 9) + NAME 'f' (1, 10) (1, 11) + LPAR '(' (1, 11) (1, 12) + RPAR ')' (1, 12) (1, 13) + COLON ':' (1, 13) (1, 14) + NEWLINE '' (1, 14) (1, 14) + INDENT '' (3, -1) (3, -1) + NAME 'def' (3, 2) (3, 5) + NAME 'baz' (3, 6) (3, 9) + LPAR '(' (3, 9) (3, 10) + RPAR ')' (3, 10) (3, 11) + COLON ':' (3, 11) (3, 12) + NAME 'pass' (3, 13) (3, 17) + NEWLINE '' (3, 17) (3, 17) + NAME 'async' (4, 2) (4, 7) + NAME 'def' (4, 8) (4, 11) + NAME 'bar' (4, 12) (4, 15) + LPAR '(' (4, 15) (4, 16) + RPAR ')' (4, 16) (4, 17) + COLON ':' (4, 17) (4, 18) + NAME 'pass' (4, 19) (4, 23) + NEWLINE '' (4, 23) (4, 23) + NAME 'await' (6, 2) (6, 7) + EQUAL '=' (6, 8) (6, 9) + NUMBER '2' (6, 10) (6, 11) + DEDENT '' (6, -1) (6, -1) + """) + + 
@unittest.expectedFailure # TODO: RUSTPYTHON + def test_unicode(self): + + self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\ + NAME 'Örter' (1, 0) (1, 5) + EQUAL '=' (1, 6) (1, 7) + STRING "u'places'" (1, 8) (1, 17) + NEWLINE '' (1, 17) (1, 17) + NAME 'grün' (2, 0) (2, 4) + EQUAL '=' (2, 5) (2, 6) + STRING "U'green'" (2, 7) (2, 15) + """) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_invalid_syntax(self): + def get_tokens(string): + the_string = StringIO(string) + return list(tokenize._generate_tokens_from_c_tokenizer(the_string.readline)) + + for case in [ + "(1+2]", + "(1+2}", + "{1+2]", + "1_", + "1.2_", + "1e2_", + "1e+", + + "\xa0", + "€", + "0b12", + "0b1_2", + "0b2", + "0b1_", + "0b", + "0o18", + "0o1_8", + "0o8", + "0o1_", + "0o", + "0x1_", + "0x", + "1_", + "012", + "1.2_", + "1e2_", + "1e+", + "'sdfsdf", + "'''sdfsdf''", + "("*1000+"a"+")"*1000, + "]", + """\ + f'__{ + x:d + }__'""", + " a\n\x00", + ]: + with self.subTest(case=case): + self.assertRaises(tokenize.TokenError, get_tokens, case) + + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: IndentationError not raised by + @support.skip_wasi_stack_overflow() + def test_max_indent(self): + MAXINDENT = 100 + + def generate_source(indents): + source = ''.join((' ' * x) + 'if True:\n' for x in range(indents)) + source += ' ' * indents + 'pass\n' + return source + + valid = generate_source(MAXINDENT - 1) + the_input = StringIO(valid) + tokens = list(tokenize._generate_tokens_from_c_tokenizer(the_input.readline)) + self.assertEqual(tokens[-2].type, tokenize.DEDENT) + self.assertEqual(tokens[-1].type, tokenize.ENDMARKER) + compile(valid, "", "exec") + + invalid = generate_source(MAXINDENT) + the_input = StringIO(invalid) + self.assertRaises(IndentationError, lambda: list(tokenize._generate_tokens_from_c_tokenizer(the_input.readline))) + self.assertRaises( + IndentationError, compile, invalid, "", "exec" + ) + + @unittest.expectedFailure # TODO: RUSTPYTHON; (0, '')] + 
def test_continuation_lines_indentation(self): + def get_tokens(string): + the_string = StringIO(string) + return [(kind, string) for (kind, string, *_) + in tokenize._generate_tokens_from_c_tokenizer(the_string.readline)] + + code = dedent(""" + def fib(n): + \\ + '''Print a Fibonacci series up to n.''' + \\ + a, b = 0, 1 + """) + + self.check_tokenize(code, """\ + NAME 'def' (2, 0) (2, 3) + NAME 'fib' (2, 4) (2, 7) + LPAR '(' (2, 7) (2, 8) + NAME 'n' (2, 8) (2, 9) + RPAR ')' (2, 9) (2, 10) + COLON ':' (2, 10) (2, 11) + NEWLINE '' (2, 11) (2, 11) + INDENT '' (4, -1) (4, -1) + STRING "'''Print a Fibonacci series up to n.'''" (4, 0) (4, 39) + NEWLINE '' (4, 39) (4, 39) + NAME 'a' (6, 0) (6, 1) + COMMA ',' (6, 1) (6, 2) + NAME 'b' (6, 3) (6, 4) + EQUAL '=' (6, 5) (6, 6) + NUMBER '0' (6, 7) (6, 8) + COMMA ',' (6, 8) (6, 9) + NUMBER '1' (6, 10) (6, 11) + NEWLINE '' (6, 11) (6, 11) + DEDENT '' (6, -1) (6, -1) + """) + + code_no_cont = dedent(""" + def fib(n): + '''Print a Fibonacci series up to n.''' + a, b = 0, 1 + """) + + self.assertEqual(get_tokens(code), get_tokens(code_no_cont)) + + code = dedent(""" + pass + \\ + + pass + """) + + self.check_tokenize(code, """\ + NAME 'pass' (2, 0) (2, 4) + NEWLINE '' (2, 4) (2, 4) + NAME 'pass' (5, 0) (5, 4) + NEWLINE '' (5, 4) (5, 4) + """) + + code_no_cont = dedent(""" + pass + pass + """) + + self.assertEqual(get_tokens(code), get_tokens(code_no_cont)) + + code = dedent(""" + if x: + y = 1 + \\ + \\ + \\ + \\ + foo = 1 + """) + + self.check_tokenize(code, """\ + NAME 'if' (2, 0) (2, 2) + NAME 'x' (2, 3) (2, 4) + COLON ':' (2, 4) (2, 5) + NEWLINE '' (2, 5) (2, 5) + INDENT '' (3, -1) (3, -1) + NAME 'y' (3, 4) (3, 5) + EQUAL '=' (3, 6) (3, 7) + NUMBER '1' (3, 8) (3, 9) + NEWLINE '' (3, 9) (3, 9) + NAME 'foo' (8, 4) (8, 7) + EQUAL '=' (8, 8) (8, 9) + NUMBER '1' (8, 10) (8, 11) + NEWLINE '' (8, 11) (8, 11) + DEDENT '' (8, -1) (8, -1) + """) + + code_no_cont = dedent(""" + if x: + y = 1 + foo = 1 + """) + + 
self.assertEqual(get_tokens(code), get_tokens(code_no_cont)) + + +class CTokenizerBufferTests(unittest.TestCase): + def test_newline_at_the_end_of_buffer(self): + # See issue 99581: Make sure that if we need to add a new line at the + # end of the buffer, we have enough space in the buffer, specially when + # the current line is as long as the buffer space available. + test_script = f"""\ + #coding: latin-1 + #{"a"*10000} + #{"a"*10002}""" + with os_helper.temp_dir() as temp_dir: + file_name = make_script(temp_dir, 'foo', test_script) + run_test_script(file_name) + + +class CommandLineTest(unittest.TestCase): + def setUp(self): + self.filename = tempfile.mktemp() + self.addCleanup(os_helper.unlink, self.filename) + + @staticmethod + def text_normalize(string): + """Dedent *string* and strip it from its surrounding whitespaces. + + This method is used by the other utility functions so that any + string to write or to match against can be freely indented. + """ + return re.sub(r'\s+', ' ', string).strip() + + def set_source(self, content): + with open(self.filename, 'w') as fp: + fp.write(content) + + def invoke_tokenize(self, *flags): + output = StringIO() + with contextlib.redirect_stdout(output): + tokenize._main(args=[*flags, self.filename]) + return self.text_normalize(output.getvalue()) + + def check_output(self, source, expect, *flags): + with self.subTest(source=source, flags=flags): + self.set_source(source) + res = self.invoke_tokenize(*flags) + expect = self.text_normalize(expect) + self.assertListEqual(res.splitlines(), expect.splitlines()) + + def test_invocation(self): + # test various combinations of parameters + base_flags = ('-e', '--exact') + + self.set_source(''' + def f(): + print(x) + return None + ''') + + for flag in base_flags: + with self.subTest(args=flag): + _ = self.invoke_tokenize(flag) + + with self.assertRaises(SystemExit): + # suppress argparse error message + with contextlib.redirect_stderr(StringIO()): + _ = 
self.invoke_tokenize('--unknown') + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_without_flag(self): + # test 'python -m tokenize source.py' + source = 'a = 1' + expect = ''' + 0,0-0,0: ENCODING 'utf-8' + 1,0-1,1: NAME 'a' + 1,2-1,3: OP '=' + 1,4-1,5: NUMBER '1' + 1,5-1,6: NEWLINE '' + 2,0-2,0: ENDMARKER '' + ''' + self.check_output(source, expect) + + @unittest.expectedFailure # TODO: RUSTPYTHON + def test_exact_flag(self): + # test 'python -m tokenize -e/--exact source.py' + source = 'a = 1' + expect = ''' + 0,0-0,0: ENCODING 'utf-8' + 1,0-1,1: NAME 'a' + 1,2-1,3: EQUAL '=' + 1,4-1,5: NUMBER '1' + 1,5-1,6: NEWLINE '' + 2,0-2,0: ENDMARKER '' + ''' + for flag in ['-e', '--exact']: + self.check_output(source, expect, flag) + + +class StringPrefixTest(unittest.TestCase): + @staticmethod + def determine_valid_prefixes(): + # Try all lengths until we find a length that has zero valid + # prefixes. This will miss the case where for example there + # are no valid 3 character prefixes, but there are valid 4 + # character prefixes. That seems unlikely. + + single_char_valid_prefixes = set() + + # Find all of the single character string prefixes. Just get + # the lowercase version, we'll deal with combinations of upper + # and lower case later. I'm using this logic just in case + # some uppercase-only prefix is added. + for letter in itertools.chain(string.ascii_lowercase, string.ascii_uppercase): + try: + eval(f'{letter}""') + single_char_valid_prefixes.add(letter.lower()) + except SyntaxError: + pass + + # This logic assumes that all combinations of valid prefixes only use + # the characters that are valid single character prefixes. That seems + # like a valid assumption, but if it ever changes this will need + # adjusting. 
+ valid_prefixes = set() + for length in itertools.count(): + num_at_this_length = 0 + for prefix in ( + "".join(l) + for l in itertools.combinations(single_char_valid_prefixes, length) + ): + for t in itertools.permutations(prefix): + for u in itertools.product(*[(c, c.upper()) for c in t]): + p = "".join(u) + if p == "not": + # 'not' can never be a string prefix, + # because it's a valid expression: not "" + continue + try: + eval(f'{p}""') + + # No syntax error, so p is a valid string + # prefix. + + valid_prefixes.add(p) + num_at_this_length += 1 + except SyntaxError: + pass + if num_at_this_length == 0: + return valid_prefixes + + + def test_prefixes(self): + # Get the list of defined string prefixes. I don't see an + # obvious documented way of doing this, but probably the best + # thing is to split apart tokenize.StringPrefix. + + # Make sure StringPrefix begins and ends in parens. We're + # assuming it's of the form "(a|b|ab)", if a, b, and cd are + # valid string prefixes. + self.assertEqual(tokenize.StringPrefix[0], '(') + self.assertEqual(tokenize.StringPrefix[-1], ')') + + # Then split apart everything else by '|'. + defined_prefixes = set(tokenize.StringPrefix[1:-1].split('|')) + + # Now compute the actual allowed string prefixes and compare + # to what is defined in the tokenize module. 
+ self.assertEqual(defined_prefixes, self.determine_valid_prefixes()) + + if __name__ == "__main__": unittest.main() diff --git a/Lib/tokenize.py b/Lib/tokenize.py index d72968e4250..1f31258ce36 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -24,10 +24,7 @@ __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' 'Michael Foord') -try: - from builtins import open as _builtin_open -except ImportError: - pass +from builtins import open as _builtin_open from codecs import lookup, BOM_UTF8 import collections import functools @@ -37,13 +34,14 @@ import sys from token import * from token import EXACT_TOKEN_TYPES +import _tokenize -cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) +cookie_re = re.compile(br'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) import token __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding", - "untokenize", "TokenInfo"] + "untokenize", "TokenInfo", "open", "TokenError"] del token class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')): @@ -88,7 +86,7 @@ def _all_string_prefixes(): # The valid string prefixes. Only contain the lower case versions, # and don't contain any permutations (include 'fr', but not # 'rf'). The various permutations will be generated. 
- _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr'] + _valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'br', 'fr', 'tr'] # if we add binary f-strings, add: ['fb', 'fbr'] result = {''} for prefix in _valid_string_prefixes: @@ -134,7 +132,7 @@ def _compile(expr): group("'", r'\\\r?\n'), StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r'\\\r?\n')) -PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple) +PseudoExtras = group(r'\\\r?\n|\z', Comment, Triple) PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) # For a given string prefix plus quotes, endpats maps it to a regex @@ -146,6 +144,7 @@ def _compile(expr): endpats[_prefix + '"'] = Double endpats[_prefix + "'''"] = Single3 endpats[_prefix + '"""'] = Double3 +del _prefix # A set of all of the single and triple quoted string prefixes, # including the opening quotes. @@ -156,13 +155,12 @@ def _compile(expr): single_quoted.add(u) for u in (t + '"""', t + "'''"): triple_quoted.add(u) +del t, u tabsize = 8 class TokenError(Exception): pass -class StopTokenizing(Exception): pass - class Untokenizer: @@ -170,6 +168,8 @@ def __init__(self): self.tokens = [] self.prev_row = 1 self.prev_col = 0 + self.prev_type = None + self.prev_line = "" self.encoding = None def add_whitespace(self, start): @@ -177,14 +177,51 @@ def add_whitespace(self, start): if row < self.prev_row or row == self.prev_row and col < self.prev_col: raise ValueError("start ({},{}) precedes previous end ({},{})" .format(row, col, self.prev_row, self.prev_col)) - row_offset = row - self.prev_row - if row_offset: - self.tokens.append("\\\n" * row_offset) - self.prev_col = 0 + self.add_backslash_continuation(start) col_offset = col - self.prev_col if col_offset: self.tokens.append(" " * col_offset) + def add_backslash_continuation(self, start): + """Add backslash continuation characters if the row has increased + without encountering a newline token. 
+ + This also inserts the correct amount of whitespace before the backslash. + """ + row = start[0] + row_offset = row - self.prev_row + if row_offset == 0: + return + + newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n' + line = self.prev_line.rstrip('\\\r\n') + ws = ''.join(_itertools.takewhile(str.isspace, reversed(line))) + self.tokens.append(ws + f"\\{newline}" * row_offset) + self.prev_col = 0 + + def escape_brackets(self, token): + characters = [] + consume_until_next_bracket = False + for character in token: + if character == "}": + if consume_until_next_bracket: + consume_until_next_bracket = False + else: + characters.append(character) + if character == "{": + n_backslashes = sum( + 1 for char in _itertools.takewhile( + "\\".__eq__, + characters[-2::-1] + ) + ) + if n_backslashes % 2 == 0 or characters[-1] != "N": + characters.append(character) + else: + consume_until_next_bracket = True + characters.append(character) + return "".join(characters) + def untokenize(self, iterable): it = iter(iterable) indents = [] @@ -214,12 +251,22 @@ def untokenize(self, iterable): self.tokens.append(indent) self.prev_col = len(indent) startline = False + elif tok_type in {FSTRING_MIDDLE, TSTRING_MIDDLE}: + if '{' in token or '}' in token: + token = self.escape_brackets(token) + last_line = token.splitlines()[-1] + end_line, end_col = end + extra_chars = last_line.count("{{") + last_line.count("}}") + end = (end_line, end_col + extra_chars) + self.add_whitespace(start) self.tokens.append(token) self.prev_row, self.prev_col = end if tok_type in (NEWLINE, NL): self.prev_row += 1 self.prev_col = 0 + self.prev_type = tok_type + self.prev_line = line return "".join(self.tokens) def compat(self, token, iterable): @@ -227,6 +274,7 @@ def compat(self, token, iterable): toks_append = self.tokens.append startline = token[0] in (NEWLINE, NL) prevstring = False + in_fstring_or_tstring = 0 for tok in _itertools.chain([token], iterable): toknum, tokval = tok[:2] @@ -245,6 
+293,10 @@ def compat(self, token, iterable): else: prevstring = False + if toknum in {FSTRING_START, TSTRING_START}: + in_fstring_or_tstring += 1 + elif toknum in {FSTRING_END, TSTRING_END}: + in_fstring_or_tstring -= 1 if toknum == INDENT: indents.append(tokval) continue @@ -256,7 +308,19 @@ def compat(self, token, iterable): elif startline and indents: toks_append(indents[-1]) startline = False + elif toknum in {FSTRING_MIDDLE, TSTRING_MIDDLE}: + tokval = self.escape_brackets(tokval) + + # Insert a space between two consecutive brackets if we are in an f-string or t-string + if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring_or_tstring: + tokval = ' ' + tokval + + # Insert a space between two consecutive f-strings + if toknum in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END): + self.tokens.append(" ") + toks_append(tokval) + self.prev_type = toknum def untokenize(iterable): @@ -268,16 +332,10 @@ def untokenize(iterable): with at least two elements, a token number and token value. If only two tokens are passed, the resulting output is poor. - Round-trip invariant for full input: - Untokenized source will match input source exactly - - Round-trip invariant for limited input: - # Output bytes will tokenize back to the input - t1 = [tok[:2] for tok in tokenize(f.readline)] - newcode = untokenize(t1) - readline = BytesIO(newcode).readline - t2 = [tok[:2] for tok in tokenize(readline)] - assert t1 == t2 + The result is guaranteed to tokenize back to match the input so + that the conversion is lossless and round-trips are assured. + The guarantee applies only to the token type and token string as + the spacing between tokens (column positions) may change. 
""" ut = Untokenizer() out = ut.untokenize(iterable) @@ -287,7 +345,7 @@ def untokenize(iterable): def _get_normal_name(orig_enc): - """Imitates get_normal_name in tokenizer.c.""" + """Imitates get_normal_name in Parser/tokenizer/helpers.c.""" # Only care about the first 12 characters. enc = orig_enc[:12].lower().replace("_", "-") if enc == "utf-8" or enc.startswith("utf-8-"): @@ -327,22 +385,23 @@ def read_or_stop(): except StopIteration: return b'' - def find_cookie(line): + def check(line, encoding): + # Check if the line matches the encoding. + if 0 in line: + raise SyntaxError("source code cannot contain null bytes") try: - # Decode as UTF-8. Either the line is an encoding declaration, - # in which case it should be pure ASCII, or it must be UTF-8 - # per default encoding. - line_string = line.decode('utf-8') + line.decode(encoding) except UnicodeDecodeError: msg = "invalid or missing encoding declaration" if filename is not None: msg = '{} for {!r}'.format(msg, filename) raise SyntaxError(msg) - match = cookie_re.match(line_string) + def find_cookie(line): + match = cookie_re.match(line) if not match: return None - encoding = _get_normal_name(match.group(1)) + encoding = _get_normal_name(match.group(1).decode()) try: codec = lookup(encoding) except LookupError: @@ -375,18 +434,23 @@ def find_cookie(line): encoding = find_cookie(first) if encoding: + check(first, encoding) return encoding, [first] if not blank_re.match(first): + check(first, default) return default, [first] second = read_or_stop() if not second: + check(first, default) return default, [first] encoding = find_cookie(second) if encoding: + check(first + second, encoding) return encoding, [first, second] + check(first + second, default) return default, [first, second] @@ -405,7 +469,6 @@ def open(filename): buffer.close() raise - def tokenize(readline): """ The tokenize() generator requires one argument, readline, which @@ -426,193 +489,13 @@ def tokenize(readline): which tells you which encoding 
was used to decode the bytes stream. """ encoding, consumed = detect_encoding(readline) - empty = _itertools.repeat(b"") - rl_gen = _itertools.chain(consumed, iter(readline, b""), empty) - return _tokenize(rl_gen.__next__, encoding) - - -def _tokenize(readline, encoding): - lnum = parenlev = continued = 0 - numchars = '0123456789' - contstr, needcont = '', 0 - contline = None - indents = [0] - + rl_gen = _itertools.chain(consumed, iter(readline, b"")) if encoding is not None: if encoding == "utf-8-sig": # BOM will already have been stripped. encoding = "utf-8" yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') - last_line = b'' - line = b'' - while True: # loop over lines in stream - try: - # We capture the value of the line variable here because - # readline uses the empty string '' to signal end of input, - # hence `line` itself will always be overwritten at the end - # of this loop. - last_line = line - line = readline() - except StopIteration: - line = b'' - - if encoding is not None: - line = line.decode(encoding) - lnum += 1 - pos, max = 0, len(line) - - if contstr: # continued string - if not line: - raise TokenError("EOF in multi-line string", strstart) - endmatch = endprog.match(line) - if endmatch: - pos = end = endmatch.end(0) - yield TokenInfo(STRING, contstr + line[:end], - strstart, (lnum, end), contline + line) - contstr, needcont = '', 0 - contline = None - elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': - yield TokenInfo(ERRORTOKEN, contstr + line, - strstart, (lnum, len(line)), contline) - contstr = '' - contline = None - continue - else: - contstr = contstr + line - contline = contline + line - continue - - elif parenlev == 0 and not continued: # new statement - if not line: break - column = 0 - while pos < max: # measure leading whitespace - if line[pos] == ' ': - column += 1 - elif line[pos] == '\t': - column = (column//tabsize + 1)*tabsize - elif line[pos] == '\f': - column = 0 - else: - break - pos += 1 - if pos == max: 
- break - - if line[pos] in '#\r\n': # skip comments or blank lines - if line[pos] == '#': - comment_token = line[pos:].rstrip('\r\n') - yield TokenInfo(COMMENT, comment_token, - (lnum, pos), (lnum, pos + len(comment_token)), line) - pos += len(comment_token) - - yield TokenInfo(NL, line[pos:], - (lnum, pos), (lnum, len(line)), line) - continue - - if column > indents[-1]: # count indents or dedents - indents.append(column) - yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) - while column < indents[-1]: - if column not in indents: - raise IndentationError( - "unindent does not match any outer indentation level", - ("", lnum, pos, line)) - indents = indents[:-1] - - yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) - - else: # continued statement - if not line: - raise TokenError("EOF in multi-line statement", (lnum, 0)) - continued = 0 - - while pos < max: - pseudomatch = _compile(PseudoToken).match(line, pos) - if pseudomatch: # scan for tokens - start, end = pseudomatch.span(1) - spos, epos, pos = (lnum, start), (lnum, end), end - if start == end: - continue - token, initial = line[start:end], line[start] - - if (initial in numchars or # ordinary number - (initial == '.' and token != '.' 
and token != '...')): - yield TokenInfo(NUMBER, token, spos, epos, line) - elif initial in '\r\n': - if parenlev > 0: - yield TokenInfo(NL, token, spos, epos, line) - else: - yield TokenInfo(NEWLINE, token, spos, epos, line) - - elif initial == '#': - assert not token.endswith("\n") - yield TokenInfo(COMMENT, token, spos, epos, line) - - elif token in triple_quoted: - endprog = _compile(endpats[token]) - endmatch = endprog.match(line, pos) - if endmatch: # all on one line - pos = endmatch.end(0) - token = line[start:pos] - yield TokenInfo(STRING, token, spos, (lnum, pos), line) - else: - strstart = (lnum, start) # multiple lines - contstr = line[start:] - contline = line - break - - # Check up to the first 3 chars of the token to see if - # they're in the single_quoted set. If so, they start - # a string. - # We're using the first 3, because we're looking for - # "rb'" (for example) at the start of the token. If - # we switch to longer prefixes, this needs to be - # adjusted. - # Note that initial == token[:1]. - # Also note that single quote checking must come after - # triple quote checking (above). - elif (initial in single_quoted or - token[:2] in single_quoted or - token[:3] in single_quoted): - if token[-1] == '\n': # continued string - strstart = (lnum, start) - # Again, using the first 3 chars of the - # token. This is looking for the matching end - # regex for the correct type of quote - # character. So it's really looking for - # endpats["'"] or endpats['"'], by trying to - # skip string prefix characters, if any. 
- endprog = _compile(endpats.get(initial) or - endpats.get(token[1]) or - endpats.get(token[2])) - contstr, needcont = line[start:], 1 - contline = line - break - else: # ordinary string - yield TokenInfo(STRING, token, spos, epos, line) - - elif initial.isidentifier(): # ordinary name - yield TokenInfo(NAME, token, spos, epos, line) - elif initial == '\\': # continued stmt - continued = 1 - else: - if initial in '([{': - parenlev += 1 - elif initial in ')]}': - parenlev -= 1 - yield TokenInfo(OP, token, spos, epos, line) - else: - yield TokenInfo(ERRORTOKEN, line[pos], - (lnum, pos), (lnum, pos+1), line) - pos += 1 - - # Add an implicit NEWLINE if the input doesn't end in one - if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"): - yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') - for indent in indents[1:]: # pop remaining indent levels - yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') - yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') - + yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True) def generate_tokens(readline): """Tokenize a source reading Python code as unicode strings. @@ -620,9 +503,9 @@ def generate_tokens(readline): This has the same API as tokenize(), except that it expects the *readline* callable to return str objects instead of bytes. 
""" - return _tokenize(readline, None) + return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True) -def main(): +def _main(args=None): import argparse # Helper error handling routines @@ -641,13 +524,13 @@ def error(message, filename=None, location=None): sys.exit(1) # Parse the arguments and options - parser = argparse.ArgumentParser(prog='python -m tokenize') + parser = argparse.ArgumentParser(color=True) parser.add_argument(dest='filename', nargs='?', metavar='filename.py', help='the file to tokenize; defaults to stdin') parser.add_argument('-e', '--exact', dest='exact', action='store_true', help='display token names using the exact type') - args = parser.parse_args() + args = parser.parse_args(args) try: # Tokenize the input @@ -657,7 +540,9 @@ def error(message, filename=None, location=None): tokens = list(tokenize(f.readline)) else: filename = "" - tokens = _tokenize(sys.stdin.readline, None) + tokens = _generate_tokens_from_c_tokenizer( + sys.stdin.readline, extra_tokens=True) + # Output the tokenization for token in tokens: @@ -683,5 +568,31 @@ def error(message, filename=None, location=None): perror("unexpected error: %s" % err) raise +def _transform_msg(msg): + """Transform error messages from the C tokenizer into the Python tokenize + + The C tokenizer is more picky than the Python one, so we need to massage + the error messages a bit for backwards compatibility. 
+ """ + if "unterminated triple-quoted string literal" in msg: + return "EOF in multi-line string" + return msg + +def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False): + """Tokenize a source reading Python code as unicode strings using the internal C tokenizer""" + if encoding is None: + it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens) + else: + it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens) + try: + for info in it: + yield TokenInfo._make(info) + except SyntaxError as e: + if type(e) != SyntaxError: + raise e from None + msg = _transform_msg(e.msg) + raise TokenError(msg, (e.lineno, e.offset)) from None + + if __name__ == "__main__": - main() + _main() From 55737ede635ba35bcc60e61420bb3a5a0b5295ec Mon Sep 17 00:00:00 2001 From: "Jeong, YunWon" Date: Mon, 9 Mar 2026 11:55:31 +0900 Subject: [PATCH 3/3] Rewrite _tokenize with 2-phase model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace per-line reparsing with single-pass tokenization: - Read all lines via readline, parse once, yield tokens - Fix token type values (COMMENT=65, NL=66, OP=55) - Fix NEWLINE/NL end positions and implicit newline handling - Fix DEDENT positions via look-ahead to next non-DEDENT token - Handle FSTRING_MIDDLE brace unescaping ({{ → {, }} → }) - Emit implicit NL before ENDMARKER when source lacks trailing newline - Raise IndentationError from lexer errors - Remove 13 expectedFailure marks for now-passing tests --- Lib/test/test_tabnanny.py | 1 - Lib/test/test_tokenize.py | 13 - crates/stdlib/src/_tokenize.rs | 747 +++++++++++++++++++++++++++++++++ crates/stdlib/src/lib.rs | 5 +- crates/stdlib/src/tokenize.rs | 391 ----------------- 5 files changed, 750 insertions(+), 407 deletions(-) create mode 100644 crates/stdlib/src/_tokenize.rs delete mode 100644 crates/stdlib/src/tokenize.rs diff --git a/Lib/test/test_tabnanny.py b/Lib/test/test_tabnanny.py index 
372be9eb8c3..d7a77eb26e4 100644 --- a/Lib/test/test_tabnanny.py +++ b/Lib/test/test_tabnanny.py @@ -316,7 +316,6 @@ def validate_cmd(self, *args, stdout="", stderr="", partial=False, expect_failur self.assertListEqual(out.splitlines(), stdout.splitlines()) self.assertListEqual(err.splitlines(), stderr.splitlines()) - @unittest.expectedFailure # TODO: RUSTPYTHON; Should displays error when errored python file is given. def test_with_errored_file(self): """Should displays error when errored python file is given.""" with TemporaryPyFile(SOURCE_CODES["wrong_indented"]) as file_path: diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index c10f80a723c..394a87c3601 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1167,7 +1167,6 @@ async def bar(): pass DEDENT '' (7, 0) (7, 0) """) - @unittest.expectedFailure # TODO: RUSTPYTHON; + " NEWLINE '\\n' (4, 1) (4, 2)"] def test_newline_after_parenthesized_block_with_comment(self): self.check_tokenize('''\ [ @@ -1192,7 +1191,6 @@ def test_closing_parenthesis_from_different_line(self): NAME 'x' (1, 3) (1, 4) """) - @unittest.expectedFailure # TODO: RUSTPYTHON; ' FSTRING_END "\'\'\'" (2, 68) (2, 71)'] def test_multiline_non_ascii_fstring(self): self.check_tokenize("""\ a = f''' @@ -1204,7 +1202,6 @@ def test_multiline_non_ascii_fstring(self): FSTRING_END "\'\'\'" (2, 68) (2, 71) """) - @unittest.expectedFailure # TODO: RUSTPYTHON; Diff is 696 characters long. Set self.maxDiff to None to see it. def test_multiline_non_ascii_fstring_with_expr(self): self.check_tokenize("""\ f''' @@ -2176,7 +2173,6 @@ def test_string_concatenation(self): # Two string literals on the same line self.check_roundtrip("'' ''") - @unittest.expectedFailure # TODO: RUSTPYTHON def test_random_files(self): # Test roundtrip on random python modules. # pass the '-ucpu' option to process the full directory. 
@@ -2214,7 +2210,6 @@ def test_indentation_semantics_retained(self): class InvalidPythonTests(TestCase): - @unittest.expectedFailure # TODO: RUSTPYTHON; Diff is 1046 characters long. Set self.maxDiff to None to see it. def test_number_followed_by_name(self): # See issue #gh-105549 source = "2sin(x)" @@ -2254,7 +2249,6 @@ def check_tokenize(self, s, expected): ) self.assertEqual(result, expected.rstrip().splitlines()) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_encoding(self): def readline(encoding): yield "1+1".encode(encoding) @@ -2386,7 +2380,6 @@ def test_float(self): NUMBER '3.14e159' (1, 4) (1, 12) """) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_string(self): self.check_tokenize('x = \'\'; y = ""', """\ @@ -2818,7 +2811,6 @@ def test_unary(self): NUMBER '1' (1, 22) (1, 23) """) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_selector(self): self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\ @@ -2841,7 +2833,6 @@ def test_selector(self): RPAR ')' (2, 29) (2, 30) """) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_method(self): self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\ @@ -2859,7 +2850,6 @@ def test_method(self): NAME 'pass' (2, 14) (2, 18) """) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_tabs(self): self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\ @@ -3144,7 +3134,6 @@ async def bar(): pass DEDENT '' (6, -1) (6, -1) """) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_unicode(self): self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\ @@ -3394,7 +3383,6 @@ def f(): with contextlib.redirect_stderr(StringIO()): _ = self.invoke_tokenize('--unknown') - @unittest.expectedFailure # TODO: RUSTPYTHON def test_without_flag(self): # test 'python -m tokenize source.py' source = 'a = 1' @@ -3408,7 +3396,6 @@ def test_without_flag(self): ''' self.check_output(source, expect) - @unittest.expectedFailure # TODO: RUSTPYTHON def 
test_exact_flag(self): # test 'python -m tokenize -e/--exact source.py' source = 'a = 1' diff --git a/crates/stdlib/src/_tokenize.rs b/crates/stdlib/src/_tokenize.rs new file mode 100644 index 00000000000..13e40ff12b0 --- /dev/null +++ b/crates/stdlib/src/_tokenize.rs @@ -0,0 +1,747 @@ +pub(crate) use _tokenize::module_def; + +#[pymodule] +mod _tokenize { + use crate::{ + common::lock::PyRwLock, + vm::{ + AsObject, Py, PyObjectRef, PyPayload, PyResult, VirtualMachine, + builtins::{PyBytes, PyStr, PyType}, + convert::ToPyObject, + function::ArgCallable, + protocol::PyIterReturn, + types::{Constructor, IterNext, Iterable, SelfIter}, + }, + }; + use ruff_python_ast::PySourceType; + use ruff_python_ast::token::{Token, TokenKind}; + use ruff_python_parser::{ + LexicalErrorType, ParseError, ParseErrorType, parse_unchecked_source, + }; + use ruff_source_file::{LineIndex, LineRanges}; + use ruff_text_size::{Ranged, TextSize}; + use core::fmt; + + const TOKEN_ENDMARKER: u8 = 0; + const TOKEN_DEDENT: u8 = 6; + const TOKEN_OP: u8 = 55; + const TOKEN_COMMENT: u8 = 65; + const TOKEN_NL: u8 = 66; + + #[pyattr] + #[pyclass(name = "TokenizerIter")] + #[derive(PyPayload)] + pub struct PyTokenizerIter { + readline: ArgCallable, + extra_tokens: bool, + encoding: Option, + state: PyRwLock, + } + + impl PyTokenizerIter { + fn readline(&self, vm: &VirtualMachine) -> PyResult { + let raw_line = match self.readline.invoke((), vm) { + Ok(v) => v, + Err(err) => { + if err.fast_isinstance(vm.ctx.exceptions.stop_iteration) { + return Ok(String::new()); + } + return Err(err); + } + }; + Ok(match &self.encoding { + Some(encoding) => { + let bytes = raw_line + .downcast::() + .map_err(|_| vm.new_type_error("readline() returned a non-bytes object"))?; + vm.state + .codec_registry + .decode_text(bytes.into(), encoding, None, vm) + .map(|s| s.to_string())? 
+ } + None => raw_line + .downcast::() + .map(|s| s.to_string()) + .map_err(|_| vm.new_type_error("readline() returned a non-string object"))?, + }) + } + } + + impl fmt::Debug for PyTokenizerIter { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("PyTokenizerIter") + .field("extra_tokens", &self.extra_tokens) + .field("encoding", &self.encoding) + .finish() + } + } + + #[pyclass(with(Constructor, Iterable, IterNext))] + impl PyTokenizerIter {} + + impl Constructor for PyTokenizerIter { + type Args = PyTokenizerIterArgs; + + fn py_new(_cls: &Py, args: Self::Args, _vm: &VirtualMachine) -> PyResult { + let Self::Args { + readline, + extra_tokens, + encoding, + } = args; + + Ok(Self { + readline, + extra_tokens, + encoding: encoding.map(|s| s.to_string()), + state: PyRwLock::new(TokenizerState { + phase: TokenizerPhase::Reading { + source: String::new(), + }, + }), + }) + } + } + + impl SelfIter for PyTokenizerIter {} + + impl IterNext for PyTokenizerIter { + fn next(zelf: &Py, vm: &VirtualMachine) -> PyResult { + let mut state = zelf.state.read().clone(); + + loop { + match &mut state.phase { + TokenizerPhase::Reading { source } => { + let line = zelf.readline(vm)?; + if line.is_empty() { + let accumulated = core::mem::take(source); + let parsed = + parse_unchecked_source(&accumulated, PySourceType::Python); + let tokens: Vec = parsed.tokens().iter().copied().collect(); + let errors: Vec = parsed.errors().to_vec(); + let line_index = LineIndex::from_source_text(&accumulated); + let implicit_nl = !accumulated.ends_with('\n'); + state.phase = TokenizerPhase::Yielding { + source: accumulated, + tokens, + errors, + index: 0, + line_index, + need_implicit_nl: implicit_nl, + pending_fstring_parts: Vec::new(), + pending_empty_fstring_middle: None, + }; + } else { + source.push_str(&line); + } + } + TokenizerPhase::Yielding { .. 
} => { + let result = + emit_next_token(&mut state, zelf.extra_tokens, vm)?; + *zelf.state.write() = state; + return Ok(result); + } + TokenizerPhase::Done => { + return Ok(PyIterReturn::StopIteration(None)); + } + } + } + } + } + + /// Emit the next token from the Yielding phase. + fn emit_next_token( + state: &mut TokenizerState, + extra_tokens: bool, + vm: &VirtualMachine, + ) -> PyResult { + let TokenizerPhase::Yielding { + source, + tokens, + errors, + index, + line_index, + need_implicit_nl, + pending_fstring_parts, + pending_empty_fstring_middle, + } = &mut state.phase + else { + unreachable!() + }; + + // Emit pending empty FSTRING_MIDDLE (for format spec nesting) + if let Some((mid_type, mid_line, mid_col, mid_line_str)) = + pending_empty_fstring_middle.take() + { + return Ok(PyIterReturn::Return(make_token_tuple( + vm, + mid_type, + "", + mid_line, + mid_col as isize, + mid_line, + mid_col as isize, + &mid_line_str, + ))); + } + + // Emit any pending fstring sub-tokens first + if let Some((tok_type, tok_str, sl, sc, el, ec)) = pending_fstring_parts.pop() { + let offset: usize = source + .lines() + .take(sl.saturating_sub(1)) + .map(|l| l.len() + 1) + .sum(); + let full_line = + source.full_line_str(TextSize::from(offset.min(source.len()) as u32)); + return Ok(PyIterReturn::Return(make_token_tuple( + vm, tok_type, &tok_str, sl, sc as isize, el, ec as isize, full_line, + ))); + } + + let source_len = TextSize::from(source.len() as u32); + + while *index < tokens.len() { + let token = tokens[*index]; + *index += 1; + let kind = token.kind(); + let range = token.range(); + + // Check for lexical indentation errors. + // Skip when source has tabs — ruff and CPython handle tab + // indentation differently (CPython uses tabsize=8), so ruff may + // report false IndentationErrors for valid mixed-tab code. 
+ if !source.contains('\t') { + for err in errors.iter() { + if !matches!( + err.error, + ParseErrorType::Lexical(LexicalErrorType::IndentationError) + ) { + continue; + } + if err.location.start() <= range.start() + && range.start() < err.location.end() + { + return Err(raise_indentation_error(vm, err, source, line_index)); + } + } + } + + if kind == TokenKind::EndOfFile { + continue; + } + + if !extra_tokens + && matches!(kind, TokenKind::Comment | TokenKind::NonLogicalNewline) + { + continue; + } + + let raw_type = token_kind_value(kind); + let token_type = if extra_tokens && raw_type > TOKEN_DEDENT && raw_type < TOKEN_OP + { + TOKEN_OP + } else { + raw_type + }; + + let (token_str, start_line, start_col, end_line, end_col, line_str) = + if kind == TokenKind::Dedent { + let last_line = source.lines().count(); + let default_pos = if extra_tokens { + (last_line + 1, 0) + } else { + (last_line, 0) + }; + let (pos, dedent_line) = + next_non_dedent_info(tokens, *index, source, line_index, default_pos); + ("", pos.0, pos.1, pos.0, pos.1, dedent_line) + } else { + let start_lc = line_index.line_column(range.start(), source); + let start_line = start_lc.line.get(); + let start_col = start_lc.column.to_zero_indexed(); + let implicit_newline = range.start() >= source_len; + let in_source = range.end() <= source_len; + + let (s, el, ec) = if kind == TokenKind::Newline { + if extra_tokens { + if implicit_newline { + ("", start_line, start_col + 1) + } else { + let s = if source[range].starts_with('\r') { + "\r\n" + } else { + "\n" + }; + (s, start_line, start_col + s.len()) + } + } else { + ("", start_line, start_col) + } + } else if kind == TokenKind::NonLogicalNewline { + let s = if in_source { &source[range] } else { "" }; + (s, start_line, start_col + s.len()) + } else { + let end_lc = line_index.line_column(range.end(), source); + let s = if in_source { &source[range] } else { "" }; + (s, end_lc.line.get(), end_lc.column.to_zero_indexed()) + }; + let line_str = 
source.full_line_str(range.start()); + (s, start_line, start_col, el, ec, line_str) + }; + + // Handle FSTRING_MIDDLE/TSTRING_MIDDLE brace unescaping + if matches!(kind, TokenKind::FStringMiddle | TokenKind::TStringMiddle) + && (token_str.contains("{{") || token_str.contains("}}")) + { + let mut parts = + split_fstring_middle(token_str, token_type, start_line, start_col) + .into_iter(); + let (tt, ts, sl, sc, el, ec) = parts.next().unwrap(); + let rest: Vec<_> = parts.collect(); + for p in rest.into_iter().rev() { + pending_fstring_parts.push(p); + } + return Ok(PyIterReturn::Return(make_token_tuple( + vm, tt, &ts, sl, sc as isize, el, ec as isize, line_str, + ))); + } + + // After emitting a Rbrace inside an fstring, check if the + // next token is also Rbrace without an intervening FStringMiddle. + // CPython emits an empty FSTRING_MIDDLE in that position. + if kind == TokenKind::Rbrace + && tokens + .get(*index) + .is_some_and(|t| t.kind() == TokenKind::Rbrace) + { + let mid_type = find_fstring_middle_type(tokens, *index); + *pending_empty_fstring_middle = Some(( + mid_type, + end_line, + end_col, + line_str.to_string(), + )); + } + + return Ok(PyIterReturn::Return(make_token_tuple( + vm, token_type, token_str, start_line, start_col as isize, end_line, + end_col as isize, line_str, + ))); + } + + // Emit implicit NL before ENDMARKER if source + // doesn't end with newline and last token is Comment + if extra_tokens && core::mem::take(need_implicit_nl) { + let last_tok = tokens + .iter() + .rev() + .find(|t| t.kind() != TokenKind::EndOfFile); + if let Some(last) = last_tok.filter(|t| t.kind() == TokenKind::Comment) { + let end_lc = line_index.line_column(last.range().end(), source); + let nl_line = end_lc.line.get(); + let nl_col = end_lc.column.to_zero_indexed(); + return Ok(PyIterReturn::Return(make_token_tuple( + vm, + TOKEN_NL, + "", + nl_line, + nl_col as isize, + nl_line, + nl_col as isize + 1, + source.full_line_str(last.range().start()), + ))); + } + } + 
+ // Check for unclosed brackets before ENDMARKER — CPython's tokenizer + // raises SyntaxError("EOF in multi-line statement") in this case. + { + let bracket_count: i32 = tokens + .iter() + .map(|t| match t.kind() { + TokenKind::Lpar | TokenKind::Lsqb | TokenKind::Lbrace => 1, + TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace => -1, + _ => 0, + }) + .sum(); + if bracket_count > 0 { + let last_line = source.lines().count(); + return Err(raise_syntax_error( + vm, + "EOF in multi-line statement", + last_line + 1, + 0, + )); + } + } + + // All tokens consumed — emit ENDMARKER + let last_line = source.lines().count(); + let (em_line, em_col, em_line_str): (usize, isize, &str) = if extra_tokens { + (last_line + 1, 0, "") + } else { + let last_line_text = source.full_line_str(TextSize::from( + source.len().saturating_sub(1) as u32, + )); + (last_line, -1, last_line_text) + }; + + let result = make_token_tuple( + vm, TOKEN_ENDMARKER, "", em_line, em_col, em_line, em_col, em_line_str, + ); + state.phase = TokenizerPhase::Done; + Ok(PyIterReturn::Return(result)) + } + + /// Determine whether to emit FSTRING_MIDDLE (60) or TSTRING_MIDDLE (63) + /// by looking back for the most recent FStringStart/TStringStart. + fn find_fstring_middle_type(tokens: &[Token], index: usize) -> u8 { + let mut depth = 0i32; + for i in (0..index).rev() { + match tokens[i].kind() { + TokenKind::FStringEnd | TokenKind::TStringEnd => depth += 1, + TokenKind::FStringStart => { + if depth == 0 { + return 60; // FSTRING_MIDDLE + } + depth -= 1; + } + TokenKind::TStringStart => { + if depth == 0 { + return 63; // TSTRING_MIDDLE + } + depth -= 1; + } + _ => {} + } + } + 60 // default to FSTRING_MIDDLE + } + + /// Find the next non-DEDENT token's position and source line. + /// Returns ((line, col), line_str). 
+ fn next_non_dedent_info<'a>( + tokens: &[Token], + index: usize, + source: &'a str, + line_index: &LineIndex, + default_pos: (usize, usize), + ) -> ((usize, usize), &'a str) { + for future in &tokens[index..] { + match future.kind() { + TokenKind::Dedent => continue, + TokenKind::EndOfFile => return (default_pos, ""), + _ => { + let flc = line_index.line_column(future.range().start(), source); + let pos = (flc.line.get(), flc.column.to_zero_indexed()); + return (pos, source.full_line_str(future.range().start())); + } + } + } + (default_pos, "") + } + + /// Raise a SyntaxError with the given message and position. + fn raise_syntax_error( + vm: &VirtualMachine, + msg: &str, + lineno: usize, + offset: usize, + ) -> rustpython_vm::builtins::PyBaseExceptionRef { + let exc = vm.new_exception_msg( + vm.ctx.exceptions.syntax_error.to_owned(), + msg.into(), + ); + let obj = exc.as_object(); + let _ = obj.set_attr("msg", vm.ctx.new_str(msg), vm); + let _ = obj.set_attr("lineno", vm.ctx.new_int(lineno), vm); + let _ = obj.set_attr("offset", vm.ctx.new_int(offset), vm); + let _ = obj.set_attr("filename", vm.ctx.new_str(""), vm); + let _ = obj.set_attr("text", vm.ctx.none(), vm); + exc + } + + /// Raise an IndentationError from a parse error. 
+ fn raise_indentation_error( + vm: &VirtualMachine, + err: &ParseError, + source: &str, + line_index: &LineIndex, + ) -> rustpython_vm::builtins::PyBaseExceptionRef { + let err_lc = line_index.line_column(err.location.start(), source); + let err_line_text = source.full_line_str(err.location.start()); + let err_text = err_line_text.trim_end_matches('\n').trim_end_matches('\r'); + let msg = format!("{}", err.error); + let exc = vm.new_exception_msg( + vm.ctx.exceptions.indentation_error.to_owned(), + msg.clone().into(), + ); + let obj = exc.as_object(); + let _ = obj.set_attr("lineno", vm.ctx.new_int(err_lc.line.get()), vm); + let _ = obj.set_attr("offset", vm.ctx.new_int(err_text.len() as i64 + 1), vm); + let _ = obj.set_attr("msg", vm.ctx.new_str(msg), vm); + let _ = obj.set_attr("filename", vm.ctx.new_str(""), vm); + let _ = obj.set_attr("text", vm.ctx.new_str(err_text), vm); + exc + } + + /// Split an FSTRING_MIDDLE/TSTRING_MIDDLE token containing `{{`/`}}` + /// into multiple unescaped sub-tokens. + /// Returns vec of (type, string, start_line, start_col, end_line, end_col). 
+ fn split_fstring_middle( + raw: &str, + token_type: u8, + start_line: usize, + start_col: usize, + ) -> Vec<(u8, String, usize, usize, usize, usize)> { + let mut parts = Vec::new(); + let mut current = String::new(); + // Track source position (line, col) — these correspond to the + // original source positions (with {{ and }} still doubled) + let mut cur_line = start_line; + let mut cur_col = start_col; + // Track the start position of the current accumulating part + let mut part_start_line = cur_line; + let mut part_start_col = cur_col; + let mut chars = raw.chars().peekable(); + + // Compute end position of the current accumulated text + let end_pos = |current: &str, start_line: usize, start_col: usize| -> (usize, usize) { + let mut el = start_line; + let mut ec = start_col; + for ch in current.chars() { + if ch == '\n' { + el += 1; + ec = 0; + } else { + ec += ch.len_utf8(); + } + } + (el, ec) + }; + + while let Some(ch) = chars.next() { + if ch == '{' && chars.peek() == Some(&'{') { + chars.next(); + current.push('{'); + cur_col += 2; // skip both {{ in source + } else if ch == '}' && chars.peek() == Some(&'}') { + chars.next(); + // Flush accumulated text before }} + if !current.is_empty() { + let (el, ec) = end_pos(¤t, part_start_line, part_start_col); + parts.push(( + token_type, + core::mem::take(&mut current), + part_start_line, + part_start_col, + el, + ec, + )); + } + // Emit unescaped '}' at source position of }} + parts.push(( + token_type, + "}".to_string(), + cur_line, + cur_col, + cur_line, + cur_col + 1, + )); + cur_col += 2; // skip both }} in source + part_start_line = cur_line; + part_start_col = cur_col; + } else { + if current.is_empty() { + part_start_line = cur_line; + part_start_col = cur_col; + } + current.push(ch); + if ch == '\n' { + cur_line += 1; + cur_col = 0; + } else { + cur_col += ch.len_utf8(); + } + } + } + + if !current.is_empty() { + let (el, ec) = end_pos(¤t, part_start_line, part_start_col); + parts.push((token_type, 
current, part_start_line, part_start_col, el, ec)); + } + + parts + } + + #[allow(clippy::too_many_arguments)] + fn make_token_tuple( + vm: &VirtualMachine, + token_type: u8, + string: &str, + start_line: usize, + start_col: isize, + end_line: usize, + end_col: isize, + line: &str, + ) -> PyObjectRef { + vm.ctx + .new_tuple(vec![ + token_type.to_pyobject(vm), + vm.ctx.new_str(string).into(), + vm.ctx + .new_tuple(vec![start_line.to_pyobject(vm), start_col.to_pyobject(vm)]) + .into(), + vm.ctx + .new_tuple(vec![end_line.to_pyobject(vm), end_col.to_pyobject(vm)]) + .into(), + vm.ctx.new_str(line).into(), + ]) + .into() + } + + #[derive(FromArgs)] + pub struct PyTokenizerIterArgs { + #[pyarg(positional)] + readline: ArgCallable, + #[pyarg(named)] + extra_tokens: bool, + #[pyarg(named, optional)] + encoding: Option>, + } + + #[derive(Clone, Debug)] + struct TokenizerState { + phase: TokenizerPhase, + } + + #[derive(Clone, Debug)] + enum TokenizerPhase { + Reading { + source: String, + }, + Yielding { + source: String, + tokens: Vec, + errors: Vec, + index: usize, + line_index: LineIndex, + need_implicit_nl: bool, + /// Pending sub-tokens from FSTRING_MIDDLE splitting + pending_fstring_parts: Vec<(u8, String, usize, usize, usize, usize)>, + /// Pending empty FSTRING_MIDDLE for format spec nesting: + /// (type, line, col, line_str) + pending_empty_fstring_middle: Option<(u8, usize, usize, String)>, + }, + Done, + } + + const fn token_kind_value(kind: TokenKind) -> u8 { + match kind { + TokenKind::EndOfFile => 0, + TokenKind::Name + | TokenKind::For + | TokenKind::In + | TokenKind::Pass + | TokenKind::Class + | TokenKind::And + | TokenKind::Is + | TokenKind::Raise + | TokenKind::True + | TokenKind::False + | TokenKind::Assert + | TokenKind::Try + | TokenKind::While + | TokenKind::Yield + | TokenKind::Lambda + | TokenKind::None + | TokenKind::Not + | TokenKind::Or + | TokenKind::Break + | TokenKind::Continue + | TokenKind::Global + | TokenKind::Nonlocal + | 
TokenKind::Return + | TokenKind::Except + | TokenKind::Import + | TokenKind::Case + | TokenKind::Match + | TokenKind::Type + | TokenKind::Await + | TokenKind::With + | TokenKind::Del + | TokenKind::Finally + | TokenKind::From + | TokenKind::Def + | TokenKind::If + | TokenKind::Else + | TokenKind::Elif + | TokenKind::As + | TokenKind::Async => 1, + TokenKind::Int | TokenKind::Complex | TokenKind::Float => 2, + TokenKind::String => 3, + TokenKind::Newline => 4, + TokenKind::NonLogicalNewline => TOKEN_NL, + TokenKind::Indent => 5, + TokenKind::Dedent => 6, + TokenKind::Lpar => 7, + TokenKind::Rpar => 8, + TokenKind::Lsqb => 9, + TokenKind::Rsqb => 10, + TokenKind::Colon => 11, + TokenKind::Comma => 12, + TokenKind::Semi => 13, + TokenKind::Plus => 14, + TokenKind::Minus => 15, + TokenKind::Star => 16, + TokenKind::Slash => 17, + TokenKind::Vbar => 18, + TokenKind::Amper => 19, + TokenKind::Less => 20, + TokenKind::Greater => 21, + TokenKind::Equal => 22, + TokenKind::Dot => 23, + TokenKind::Percent => 24, + TokenKind::Lbrace => 25, + TokenKind::Rbrace => 26, + TokenKind::EqEqual => 27, + TokenKind::NotEqual => 28, + TokenKind::LessEqual => 29, + TokenKind::GreaterEqual => 30, + TokenKind::Tilde => 31, + TokenKind::CircumFlex => 32, + TokenKind::LeftShift => 33, + TokenKind::RightShift => 34, + TokenKind::DoubleStar => 35, + TokenKind::PlusEqual => 36, + TokenKind::MinusEqual => 37, + TokenKind::StarEqual => 38, + TokenKind::SlashEqual => 39, + TokenKind::PercentEqual => 40, + TokenKind::AmperEqual => 41, + TokenKind::VbarEqual => 42, + TokenKind::CircumflexEqual => 43, + TokenKind::LeftShiftEqual => 44, + TokenKind::RightShiftEqual => 45, + TokenKind::DoubleStarEqual => 46, + TokenKind::DoubleSlash => 47, + TokenKind::DoubleSlashEqual => 48, + TokenKind::At => 49, + TokenKind::AtEqual => 50, + TokenKind::Rarrow => 51, + TokenKind::Ellipsis => 52, + TokenKind::ColonEqual => 53, + TokenKind::Exclamation => 54, + TokenKind::FStringStart => 59, + TokenKind::FStringMiddle 
=> 60, + TokenKind::FStringEnd => 61, + TokenKind::Comment => TOKEN_COMMENT, + TokenKind::TStringStart => 62, + TokenKind::TStringMiddle => 63, + TokenKind::TStringEnd => 64, + TokenKind::IpyEscapeCommand + | TokenKind::Question + | TokenKind::Unknown => 67, // ERRORTOKEN + } + } +} diff --git a/crates/stdlib/src/lib.rs b/crates/stdlib/src/lib.rs index 04aa623d185..4c06eea9ef4 100644 --- a/crates/stdlib/src/lib.rs +++ b/crates/stdlib/src/lib.rs @@ -49,7 +49,8 @@ mod pystruct; mod random; mod statistics; mod suggestions; -mod tokenize; +#[path = "_tokenize.rs"] +mod _tokenize; // TODO: maybe make this an extension module, if we ever get those // mod re; #[cfg(all(feature = "host_env", not(target_arch = "wasm32")))] @@ -226,7 +227,7 @@ pub fn stdlib_module_defs(ctx: &Context) -> Vec<&'static builtins::PyModuleDef> ssl::module_def(ctx), statistics::module_def(ctx), suggestions::module_def(ctx), - tokenize::module_def(ctx), + _tokenize::module_def(ctx), #[cfg(all(feature = "host_env", unix, not(target_os = "redox")))] syslog::module_def(ctx), #[cfg(all( diff --git a/crates/stdlib/src/tokenize.rs b/crates/stdlib/src/tokenize.rs deleted file mode 100644 index 33667a203ec..00000000000 --- a/crates/stdlib/src/tokenize.rs +++ /dev/null @@ -1,391 +0,0 @@ -pub(crate) use _tokenize::module_def; - -#[pymodule] -mod _tokenize { - use crate::{ - common::lock::PyRwLock, - vm::{ - AsObject, Py, PyPayload, PyResult, VirtualMachine, - builtins::{PyBytes, PyStr, PyType}, - convert::ToPyObject, - function::ArgCallable, - protocol::PyIterReturn, - types::{Constructor, IterNext, Iterable, SelfIter}, - }, - }; - use ruff_python_ast::PySourceType; - use ruff_python_ast::token::{Token, TokenKind, Tokens}; - use ruff_python_parser::{ParseError, parse_unchecked_source}; - use ruff_source_file::{LineIndex, LineRanges}; - use ruff_text_size::{Ranged, TextRange}; - use std::{cmp::Ordering, fmt}; - - /// `__import__("token").OP` - const TOKEN_OP: u8 = 55; - - #[pyattr] - #[pyclass(name = 
"TokenizerIter")] - #[derive(PyPayload)] - pub struct PyTokenizerIter { - readline: ArgCallable, // TODO: This should be PyObject - extra_tokens: bool, - encoding: Option, - state: PyRwLock, - } - - impl PyTokenizerIter { - fn readline(&self, vm: &VirtualMachine) -> PyResult { - // TODO: When `readline` is PyObject, - // we need to check if it's callable and raise a type error if it's not. - let raw_line = match self.readline.invoke((), vm) { - Ok(v) => v, - Err(err) => { - if err.fast_isinstance(vm.ctx.exceptions.stop_iteration) { - return Ok(String::new()); - } - return Err(err); - } - }; - Ok(match &self.encoding { - Some(encoding) => { - let bytes = raw_line - .downcast::() - .map_err(|_| vm.new_type_error("readline() returned a non-bytes object"))?; - vm.state - .codec_registry - .decode_text(bytes.into(), encoding, None, vm) - .map(|s| s.to_string())? - } - None => raw_line - .downcast::() - .map(|s| s.to_string()) - .map_err(|_| vm.new_type_error("readline() returned a non-string object"))?, - }) - } - } - - impl fmt::Debug for PyTokenizerIter { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("PyTokenizerIter") - .field("readline", &self.readline) - .field("encoding", &self.encoding) - .field("extra_tokens", &self.extra_tokens) - .finish() - } - } - - #[pyclass(with(Constructor, Iterable, IterNext))] - impl PyTokenizerIter {} - - impl Constructor for PyTokenizerIter { - type Args = PyTokenizerIterArgs; - - fn py_new(_cls: &Py, args: Self::Args, _vm: &VirtualMachine) -> PyResult { - let Self::Args { - readline, - extra_tokens, - encoding, - } = args; - - Ok(Self { - readline, - extra_tokens, - encoding: encoding.map(|s| s.to_string()), - state: PyRwLock::new(PyTokenizerIterState::default()), - }) - } - } - - impl SelfIter for PyTokenizerIter {} - - impl IterNext for PyTokenizerIter { - fn next(zelf: &Py, vm: &VirtualMachine) -> PyResult { - let mut state = { - let guard = zelf.state.read(); - guard.clone() - }; - - if state.eof { 
- return Ok(PyIterReturn::StopIteration(None)); - } - - let token = loop { - // TODO: Check here for errors. Raise SyntaxError if needed - - if let Some(tok) = state.next_token() { - break tok; - } - - let nline = zelf.readline(vm)?; - if nline.is_empty() { - state.eof = true; - *zelf.state.write() = state.clone(); - - let line_num = &state.start().0; - let out = vm - .ctx - .new_tuple(vec![ - token_kind_value(TokenKind::EndOfFile).to_pyobject(vm), - vm.ctx.new_str("").into(), - vm.ctx - .new_tuple(vec![line_num.to_pyobject(vm), (-1).to_pyobject(vm)]) - .into(), - vm.ctx - .new_tuple(vec![line_num.to_pyobject(vm), (-1).to_pyobject(vm)]) - .into(), - vm.ctx.new_str(state.current_line()).into(), - ]) - .into(); - return Ok(PyIterReturn::Return(out)); - } - state.push_line(&nline); - }; - - *zelf.state.write() = state.clone(); - - let token_kind = token.kind(); - let token_value = if zelf.extra_tokens && token_kind.is_operator() { - TOKEN_OP - } else { - token_kind_value(token_kind) - }; - let (start_x, start_y) = &state.start(); - let (end_x, end_y) = &state.end(); - - let mut token_repr = &state.source[state.range()]; - if !zelf.extra_tokens { - token_repr = token_repr.trim(); - } - - let out = vm - .ctx - .new_tuple(vec![ - token_value.to_pyobject(vm), - vm.ctx.new_str(token_repr).into(), - vm.ctx - .new_tuple(vec![start_x.to_pyobject(vm), start_y.to_pyobject(vm)]) - .into(), - vm.ctx - .new_tuple(vec![end_x.to_pyobject(vm), end_y.to_pyobject(vm)]) - .into(), - vm.ctx.new_str(state.current_line()).into(), - ]) - .into(); - Ok(PyIterReturn::Return(out)) - } - } - - #[derive(FromArgs)] - pub struct PyTokenizerIterArgs { - #[pyarg(positional)] - readline: ArgCallable, - #[pyarg(named)] - extra_tokens: bool, - #[pyarg(named, optional)] - encoding: Option>, - } - - #[derive(Clone, Debug)] - struct PyTokenizerIterState { - /// Source code. - source: String, - prev_token: Option, - /// Tokens of `source`. 
- tokens: Tokens, - /// Errors of `source` - errors: Vec, - /// LineIndex of `source`. - line_index: LineIndex, - /// Marker that says we already emitted EOF, and needs to stop iterating. - eof: bool, - } - - impl PyTokenizerIterState { - fn push_line(&mut self, line: &str) { - self.source.push_str(line); - - let parsed = parse_unchecked_source(&self.source, PySourceType::Python); - self.tokens = parsed.tokens().clone(); - self.errors = parsed.errors().to_vec(); - self.line_index = LineIndex::from_source_text(&self.source); - } - - #[must_use] - fn current_line(&self) -> &str { - let (kind, range): (TokenKind, TextRange) = match self.prev_token { - Some(token) => token.as_tuple(), - None => (TokenKind::Unknown, TextRange::default()), - }; - - match kind { - TokenKind::Newline => self.source.full_line_str(range.start()), - _ => self.source.full_lines_str(range), - } - } - - #[must_use] - fn next_token(&mut self) -> Option { - for token in self.tokens.iter() { - let (kind, range): (TokenKind, TextRange) = token.as_tuple(); - - if matches!(kind, TokenKind::NonLogicalNewline) { - continue; - } - - if matches!(range.ordering(self.range()), Ordering::Greater) { - self.prev_token = Some(*token); - return self.prev_token; - } - } - - None - } - - #[must_use] - fn range(&self) -> TextRange { - match self.prev_token { - Some(token) => token.range(), - None => TextRange::default(), - } - } - - #[must_use] - fn start(&self) -> (usize, usize) { - let lc = self - .line_index - .line_column(self.range().start(), &self.source); - (lc.line.get(), lc.column.to_zero_indexed()) - } - - #[must_use] - fn end(&self) -> (usize, usize) { - let lc = self - .line_index - .line_column(self.range().end(), &self.source); - (lc.line.get(), lc.column.to_zero_indexed()) - } - } - - impl Default for PyTokenizerIterState { - fn default() -> Self { - const SOURCE: &str = ""; - let parsed = parse_unchecked_source(SOURCE, PySourceType::Python); - - Self { - source: SOURCE.to_owned(), - prev_token: 
None, - tokens: parsed.tokens().clone(), - errors: parsed.errors().to_vec(), - line_index: LineIndex::from_source_text(SOURCE), - eof: false, - } - } - } - - const fn token_kind_value(kind: TokenKind) -> u8 { - match kind { - TokenKind::EndOfFile => 0, - TokenKind::Name - | TokenKind::For - | TokenKind::In - | TokenKind::Pass - | TokenKind::Class - | TokenKind::And - | TokenKind::Is - | TokenKind::Raise - | TokenKind::True - | TokenKind::False - | TokenKind::Assert - | TokenKind::Try - | TokenKind::While - | TokenKind::Yield - | TokenKind::Lambda - | TokenKind::None - | TokenKind::Not - | TokenKind::Or - | TokenKind::Break - | TokenKind::Continue - | TokenKind::Global - | TokenKind::Nonlocal - | TokenKind::Return - | TokenKind::Except - | TokenKind::Import - | TokenKind::Case - | TokenKind::Match - | TokenKind::Type - | TokenKind::Await - | TokenKind::With - | TokenKind::Del - | TokenKind::Finally - | TokenKind::From - | TokenKind::Def - | TokenKind::If - | TokenKind::Else - | TokenKind::Elif - | TokenKind::As - | TokenKind::Async => 1, - TokenKind::Int | TokenKind::Complex | TokenKind::Float => 2, - TokenKind::String => 3, - TokenKind::Newline | TokenKind::NonLogicalNewline => 4, - TokenKind::Indent => 5, - TokenKind::Dedent => 6, - TokenKind::Lpar => 7, - TokenKind::Rpar => 8, - TokenKind::Lsqb => 9, - TokenKind::Rsqb => 10, - TokenKind::Colon => 11, - TokenKind::Comma => 12, - TokenKind::Semi => 13, - TokenKind::Plus => 14, - TokenKind::Minus => 15, - TokenKind::Star => 16, - TokenKind::Slash => 17, - TokenKind::Vbar => 18, - TokenKind::Amper => 19, - TokenKind::Less => 20, - TokenKind::Greater => 21, - TokenKind::Equal => 22, - TokenKind::Dot => 23, - TokenKind::Percent => 24, - TokenKind::Lbrace => 25, - TokenKind::Rbrace => 26, - TokenKind::EqEqual => 27, - TokenKind::NotEqual => 28, - TokenKind::LessEqual => 29, - TokenKind::GreaterEqual => 30, - TokenKind::Tilde => 31, - TokenKind::CircumFlex => 32, - TokenKind::LeftShift => 33, - TokenKind::RightShift => 
34, - TokenKind::DoubleStar => 35, - TokenKind::PlusEqual => 36, - TokenKind::MinusEqual => 37, - TokenKind::StarEqual => 38, - TokenKind::SlashEqual => 39, - TokenKind::PercentEqual => 40, - TokenKind::AmperEqual => 41, - TokenKind::VbarEqual => 42, - TokenKind::CircumflexEqual => 43, - TokenKind::LeftShiftEqual => 44, - TokenKind::RightShiftEqual => 45, - TokenKind::DoubleStarEqual => 46, - TokenKind::DoubleSlash => 47, - TokenKind::DoubleSlashEqual => 48, - TokenKind::At => 49, - TokenKind::AtEqual => 50, - TokenKind::Rarrow => 51, - TokenKind::Ellipsis => 52, - TokenKind::ColonEqual => 53, - TokenKind::Exclamation => 54, - TokenKind::FStringStart => 59, - TokenKind::FStringMiddle => 60, - TokenKind::FStringEnd => 61, - TokenKind::Comment => 62, - TokenKind::TStringStart => 62, // 3.14 compatible - TokenKind::TStringMiddle => 63, // 3.14 compatible - TokenKind::TStringEnd => 64, // 3.14 compatible - TokenKind::IpyEscapeCommand | TokenKind::Question => 0, // Ruff's specific - TokenKind::Unknown => 0, - } - } -}