From 5c5aff8fd9729050681697fc81a1c721b2199d92 Mon Sep 17 00:00:00 2001
From: ShaharNaveh <shaharnaveh@users.noreply.github.com>
Date: Fri, 6 Mar 2026 09:47:14 +0900
Subject: [PATCH 1/3] Base implementation of _tokenize module

Port from PR #6240 by ShaharNaveh, adapted to current codebase.
Uses ruff_python_parser for tokenization via TokenizerIter.
---
 Cargo.lock                    |   4 +
 crates/stdlib/Cargo.toml      |   5 +
 crates/stdlib/src/lib.rs      |   2 +
 crates/stdlib/src/tokenize.rs | 391 ++++++++++++++++++++++++++++++++++
 4 files changed, 402 insertions(+)
 create mode 100644 crates/stdlib/src/tokenize.rs
diff --git a/Cargo.lock b/Cargo.lock
index ffece1cb31a..1a291035bb0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3283,6 +3283,10 @@ dependencies = [
  "pkcs8",
  "pymath",
  "rand_core 0.9.5",
+ "ruff_python_ast",
+ "ruff_python_parser",
+ "ruff_source_file",
+ "ruff_text_size",
  "rustix",
  "rustls",
  "rustls-native-certs",
diff --git a/crates/stdlib/Cargo.toml b/crates/stdlib/Cargo.toml
index a40a5bf24a8..7d230fcc046 100644
--- a/crates/stdlib/Cargo.toml
+++ b/crates/stdlib/Cargo.toml
@@ -31,6 +31,11 @@ rustpython-derive = { workspace = true }
 rustpython-vm = { workspace = true, default-features = false, features = ["compiler"]}
 rustpython-common = { workspace = true }
 
+ruff_python_parser = { workspace = true }
+ruff_python_ast = { workspace = true }
+ruff_text_size = { workspace = true }
+ruff_source_file = { workspace = true }
+
 ahash = { workspace = true }
 ascii = { workspace = true }
 cfg-if = { workspace = true }
diff --git a/crates/stdlib/src/lib.rs b/crates/stdlib/src/lib.rs
index 8c234c22f89..04aa623d185 100644
--- a/crates/stdlib/src/lib.rs
+++ b/crates/stdlib/src/lib.rs
@@ -49,6 +49,7 @@ mod pystruct;
 mod random;
 mod statistics;
 mod suggestions;
+mod tokenize;
 // TODO: maybe make this an extension module, if we ever get those
 // mod re;
 #[cfg(all(feature = "host_env", not(target_arch = "wasm32")))]
@@ -225,6 +226,7 @@ pub fn stdlib_module_defs(ctx: &Context) -> Vec<&'static builtins::PyModuleDef>
         ssl::module_def(ctx),
         statistics::module_def(ctx),
         suggestions::module_def(ctx),
+        tokenize::module_def(ctx),
         #[cfg(all(feature = "host_env", unix, not(target_os = "redox")))]
         syslog::module_def(ctx),
         #[cfg(all(
diff --git a/crates/stdlib/src/tokenize.rs b/crates/stdlib/src/tokenize.rs
new file mode 100644
index 00000000000..33667a203ec
--- /dev/null
+++ b/crates/stdlib/src/tokenize.rs
@@ -0,0 +1,391 @@
+pub(crate) use _tokenize::module_def;
+
+#[pymodule]
+mod _tokenize {
+    use crate::{
+        common::lock::PyRwLock,
+        vm::{
+            AsObject, Py, PyPayload, PyResult, VirtualMachine,
+            builtins::{PyBytes, PyStr, PyType},
+            convert::ToPyObject,
+            function::ArgCallable,
+            protocol::PyIterReturn,
+            types::{Constructor, IterNext, Iterable, SelfIter},
+        },
+    };
+    use ruff_python_ast::PySourceType;
+    use ruff_python_ast::token::{Token, TokenKind, Tokens};
+    use ruff_python_parser::{ParseError, parse_unchecked_source};
+    use ruff_source_file::{LineIndex, LineRanges};
+    use ruff_text_size::{Ranged, TextRange};
+    use std::{cmp::Ordering, fmt};
+
+    /// `__import__("token").OP`
+    const TOKEN_OP: u8 = 55;
+
+    #[pyattr]
+    #[pyclass(name = "TokenizerIter")]
+    #[derive(PyPayload)]
+    pub struct PyTokenizerIter {
+        readline: ArgCallable, // TODO: This should be PyObject
+        extra_tokens: bool, // when set: operators become `OP`, token text is not trimmed
+        encoding: Option<String>, // `Some`: `readline` must return bytes; `None`: str
+        state: PyRwLock<PyTokenizerIterState>, // tokenizer progress, written back by `next`
+    }
+
+    impl PyTokenizerIter {
+        /// Calls `readline` once and returns its result as text.
+        /// A `StopIteration` raised by the callable is reported as an empty string.
+        fn readline(&self, vm: &VirtualMachine) -> PyResult<String> {
+            // TODO: When `readline` is PyObject,
+            // we need to check if it's callable and raise a type error if it's not.
+            let raw_line = match self.readline.invoke((), vm) {
+                Ok(line) => line,
+                Err(exc) if exc.fast_isinstance(vm.ctx.exceptions.stop_iteration) => {
+                    return Ok(String::new());
+                }
+                Err(exc) => return Err(exc),
+            };
+            // Without an encoding the callable must hand back `str` directly.
+            let Some(encoding) = &self.encoding else {
+                return raw_line
+                    .downcast::<PyStr>()
+                    .map(|text| text.to_string())
+                    .map_err(|_| vm.new_type_error("readline() returned a non-string object"));
+            };
+            // With an encoding it must hand back `bytes`, decoded through the codec registry.
+            let bytes = raw_line
+                .downcast::<PyBytes>()
+                .map_err(|_| vm.new_type_error("readline() returned a non-bytes object"))?;
+            vm.state
+                .codec_registry
+                .decode_text(bytes.into(), encoding, None, vm)
+                .map(|decoded| decoded.to_string())
+        }
+    }
+
+    impl fmt::Debug for PyTokenizerIter {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            f.debug_struct("PyTokenizerIter") // the `state` field is not part of the output
+                .field("readline", &self.readline)
+                .field("encoding", &self.encoding)
+                .field("extra_tokens", &self.extra_tokens)
+                .finish()
+        }
+    }
+
+    #[pyclass(with(Constructor, Iterable, IterNext))]
+    impl PyTokenizerIter {}
+
+    impl Constructor for PyTokenizerIter {
+        type Args = PyTokenizerIterArgs;
+
+        /// Builds a tokenizer iterator with an empty, not-yet-started state.
+        ///
+        /// Nothing is read from `readline` here; input is pulled lazily by `next`.
+        fn py_new(_cls: &Py<PyType>, args: Self::Args, _vm: &VirtualMachine) -> PyResult<Self> {
+            // The encoding name is stored as an owned `String` rather than a Python object.
+            let encoding = args.encoding.map(|name| name.to_string());
+
+            Ok(Self {
+                readline: args.readline,
+                extra_tokens: args.extra_tokens,
+                encoding,
+                state: PyRwLock::new(PyTokenizerIterState::default()),
+            })
+        }
+    }
+
+    impl SelfIter for PyTokenizerIter {}
+
+    impl IterNext for PyTokenizerIter {
+        /// Yields the next `(type, string, start, end, line)` tuple, reading lines on demand.
+        fn next(zelf: &Py<Self>, vm: &VirtualMachine) -> PyResult<PyIterReturn> {
+            // Work on a snapshot: the read guard is dropped before `readline` is invoked.
+            let mut state = zelf.state.read().clone();
+
+            if state.eof {
+                return Ok(PyIterReturn::StopIteration(None));
+            }
+
+            let token = loop {
+                // TODO: Check here for errors. Raise SyntaxError if needed
+
+                if let Some(tok) = state.next_token() {
+                    break tok;
+                }
+
+                // An empty line from `readline` means the input is exhausted: emit ENDMARKER.
+                let nline = zelf.readline(vm)?;
+                if nline.is_empty() {
+                    state.eof = true;
+                    let line_num = &state.start().0;
+                    let out = vm
+                        .ctx
+                        .new_tuple(vec![
+                            token_kind_value(TokenKind::EndOfFile).to_pyobject(vm),
+                            vm.ctx.new_str("").into(),
+                            vm.ctx
+                                .new_tuple(vec![line_num.to_pyobject(vm), (-1).to_pyobject(vm)])
+                                .into(),
+                            vm.ctx
+                                .new_tuple(vec![line_num.to_pyobject(vm), (-1).to_pyobject(vm)])
+                                .into(),
+                            vm.ctx.new_str(state.current_line()).into(),
+                        ])
+                        .into();
+                    // Store the state by move; cloning it here would copy the whole source.
+                    *zelf.state.write() = state;
+                    return Ok(PyIterReturn::Return(out));
+                }
+                state.push_line(&nline);
+            };
+
+            let token_kind = token.kind();
+            let token_value = if zelf.extra_tokens && token_kind.is_operator() {
+                TOKEN_OP
+            } else {
+                token_kind_value(token_kind)
+            };
+            let (start_x, start_y) = &state.start();
+            let (end_x, end_y) = &state.end();
+
+            let mut token_repr = &state.source[state.range()];
+            if !zelf.extra_tokens {
+                token_repr = token_repr.trim();
+            }
+
+            let out = vm
+                .ctx
+                .new_tuple(vec![
+                    token_value.to_pyobject(vm),
+                    vm.ctx.new_str(token_repr).into(),
+                    vm.ctx
+                        .new_tuple(vec![start_x.to_pyobject(vm), start_y.to_pyobject(vm)])
+                        .into(),
+                    vm.ctx
+                        .new_tuple(vec![end_x.to_pyobject(vm), end_y.to_pyobject(vm)])
+                        .into(),
+                    vm.ctx.new_str(state.current_line()).into(),
+                ])
+                .into();
+            // All borrows of `state` ended above, so it can be written back without a clone.
+            *zelf.state.write() = state;
+            Ok(PyIterReturn::Return(out))
+        }
+    }
+
+    #[derive(FromArgs)]
+    pub struct PyTokenizerIterArgs {
+        #[pyarg(positional)]
+        readline: ArgCallable, // invoked with no arguments each time more source is needed
+        #[pyarg(named)]
+        extra_tokens: bool, // named argument declared without `optional`
+        #[pyarg(named, optional)]
+        encoding: Option<rustpython_vm::PyRef<PyStr>>, // when given, `readline` output is decoded
+    }
+
+    #[derive(Clone, Debug)]
+    struct PyTokenizerIterState {
+        /// Source code.
+        source: String,
+        prev_token: Option<Token>, // most recent token returned by `next_token`, if any
+        /// Tokens of `source`.
+        tokens: Tokens,
+        /// Errors of `source`
+        errors: Vec<ParseError>,
+        /// LineIndex of `source`.
+        line_index: LineIndex,
+        /// Marker that says we already emitted EOF, and needs to stop iterating.
+        eof: bool,
+    }
+
+    impl PyTokenizerIterState {
+        /// Appends `line` to the buffered source and re-parses the whole buffer.
+        ///
+        /// Tokens, parse errors and the line index are all rebuilt from the full source.
+        fn push_line(&mut self, line: &str) {
+            self.source.push_str(line);
+
+            let reparsed = parse_unchecked_source(&self.source, PySourceType::Python);
+            self.line_index = LineIndex::from_source_text(&self.source);
+            self.tokens = reparsed.tokens().clone();
+            self.errors = reparsed.errors().to_vec();
+        }
+
+        /// Returns the full source line(s) spanned by the most recently returned token.
+        ///
+        /// Before any token has been returned this uses the empty default range.
+        #[must_use]
+        fn current_line(&self) -> &str {
+            match self.prev_token.map(|token| token.as_tuple()) {
+                // A NEWLINE token is looked up by its start offset only.
+                Some((TokenKind::Newline, range)) => self.source.full_line_str(range.start()),
+                Some((_, range)) => self.source.full_lines_str(range),
+                None => self.source.full_lines_str(TextRange::default()),
+            }
+        }
+
+        /// Advances to the first token whose range orders `Greater` than the previous token's.
+        ///
+        /// `NonLogicalNewline` tokens are skipped; `None` means the buffer has no further token.
+        #[must_use]
+        fn next_token(&mut self) -> Option<Token> {
+            let consumed = self.range();
+            let upcoming = self.tokens.iter().copied().find(|token| {
+                let (kind, range) = token.as_tuple();
+                !matches!(kind, TokenKind::NonLogicalNewline)
+                    && matches!(range.ordering(consumed), Ordering::Greater)
+            })?;
+
+            self.prev_token = Some(upcoming);
+            self.prev_token
+        }
+
+        /// Range of the most recently returned token.
+        /// Falls back to `TextRange::default()` before any token has been returned.
+        #[must_use]
+        fn range(&self) -> TextRange {
+            self.prev_token.map_or_else(TextRange::default, |token| token.range())
+        }
+
+        /// Line (via `line.get()`) and zero-indexed column where the current token starts.
+        #[must_use]
+        fn start(&self) -> (usize, usize) {
+            let offset = self.range().start();
+            let position = self.line_index.line_column(offset, &self.source);
+            (position.line.get(), position.column.to_zero_indexed())
+        }
+
+        /// Line (via `line.get()`) and zero-indexed column where the current token ends.
+        #[must_use]
+        fn end(&self) -> (usize, usize) {
+            let offset = self.range().end();
+            let position = self.line_index.line_column(offset, &self.source);
+            (position.line.get(), position.column.to_zero_indexed())
+        }
+    }
+
+    impl Default for PyTokenizerIterState {
+        /// State for an empty source buffer: parsed "" with no token returned yet.
+        fn default() -> Self {
+            let empty = String::new();
+            let parsed = parse_unchecked_source(&empty, PySourceType::Python);
+            Self {
+                line_index: LineIndex::from_source_text(&empty),
+                tokens: parsed.tokens().clone(),
+                errors: parsed.errors().to_vec(),
+                source: empty,
+                prev_token: None,
+                eof: false,
+            }
+        }
+    }
+
+    /// Maps a ruff `TokenKind` to CPython 3.14's `token` module value (0 when there is none).
+    const fn token_kind_value(kind: TokenKind) -> u8 {
+        match kind {
+            TokenKind::EndOfFile => 0,
+            TokenKind::Name
+            | TokenKind::For
+            | TokenKind::In
+            | TokenKind::Pass
+            | TokenKind::Class
+            | TokenKind::And
+            | TokenKind::Is
+            | TokenKind::Raise
+            | TokenKind::True
+            | TokenKind::False
+            | TokenKind::Assert
+            | TokenKind::Try
+            | TokenKind::While
+            | TokenKind::Yield
+            | TokenKind::Lambda
+            | TokenKind::None
+            | TokenKind::Not
+            | TokenKind::Or
+            | TokenKind::Break
+            | TokenKind::Continue
+            | TokenKind::Global
+            | TokenKind::Nonlocal
+            | TokenKind::Return
+            | TokenKind::Except
+            | TokenKind::Import
+            | TokenKind::Case
+            | TokenKind::Match
+            | TokenKind::Type
+            | TokenKind::Await
+            | TokenKind::With
+            | TokenKind::Del
+            | TokenKind::Finally
+            | TokenKind::From
+            | TokenKind::Def
+            | TokenKind::If
+            | TokenKind::Else
+            | TokenKind::Elif
+            | TokenKind::As
+            | TokenKind::Async => 1,
+            TokenKind::Int | TokenKind::Complex | TokenKind::Float => 2,
+            TokenKind::String => 3,
+            TokenKind::Newline | TokenKind::NonLogicalNewline => 4,
+            TokenKind::Indent => 5,
+            TokenKind::Dedent => 6,
+            TokenKind::Lpar => 7,
+            TokenKind::Rpar => 8,
+            TokenKind::Lsqb => 9,
+            TokenKind::Rsqb => 10,
+            TokenKind::Colon => 11,
+            TokenKind::Comma => 12,
+            TokenKind::Semi => 13,
+            TokenKind::Plus => 14,
+            TokenKind::Minus => 15,
+            TokenKind::Star => 16,
+            TokenKind::Slash => 17,
+            TokenKind::Vbar => 18,
+            TokenKind::Amper => 19,
+            TokenKind::Less => 20,
+            TokenKind::Greater => 21,
+            TokenKind::Equal => 22,
+            TokenKind::Dot => 23,
+            TokenKind::Percent => 24,
+            TokenKind::Lbrace => 25,
+            TokenKind::Rbrace => 26,
+            TokenKind::EqEqual => 27,
+            TokenKind::NotEqual => 28,
+            TokenKind::LessEqual => 29,
+            TokenKind::GreaterEqual => 30,
+            TokenKind::Tilde => 31,
+            TokenKind::CircumFlex => 32,
+            TokenKind::LeftShift => 33,
+            TokenKind::RightShift => 34,
+            TokenKind::DoubleStar => 35,
+            TokenKind::PlusEqual => 36,
+            TokenKind::MinusEqual => 37,
+            TokenKind::StarEqual => 38,
+            TokenKind::SlashEqual => 39,
+            TokenKind::PercentEqual => 40,
+            TokenKind::AmperEqual => 41,
+            TokenKind::VbarEqual => 42,
+            TokenKind::CircumflexEqual => 43,
+            TokenKind::LeftShiftEqual => 44,
+            TokenKind::RightShiftEqual => 45,
+            TokenKind::DoubleStarEqual => 46,
+            TokenKind::DoubleSlash => 47,
+            TokenKind::DoubleSlashEqual => 48,
+            TokenKind::At => 49,
+            TokenKind::AtEqual => 50,
+            TokenKind::Rarrow => 51,
+            TokenKind::Ellipsis => 52,
+            TokenKind::ColonEqual => 53,
+            TokenKind::Exclamation => 54,
+            TokenKind::FStringStart => 59,
+            TokenKind::FStringMiddle => 60,
+            TokenKind::FStringEnd => 61,
+            TokenKind::TStringStart => 62,  // 3.14 compatible
+            TokenKind::TStringMiddle => 63, // 3.14 compatible
+            TokenKind::TStringEnd => 64,    // 3.14 compatible
+            TokenKind::Comment => 65,       // 3.14 compatible (62 is TSTRING_START there)
+            TokenKind::IpyEscapeCommand | TokenKind::Question | TokenKind::Unknown => 0,
+        }
+    }
+}

From bf2b993c93ad55363aa0b8c95222aab3b58c135e Mon Sep 17 00:00:00 2001
From: CPython Developers <>
Date: Fri, 6 Mar 2026 09:53:33 +0900
Subject: [PATCH 2/3] Update tokenize from v3.14.3

---
 Lib/test/test_tokenize.py | 2150 ++++++++++++++++++++++++++++++++++---
 Lib/tokenize.py           |  353 +++---
 2 files changed, 2126 insertions(+), 377 deletions(-)

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 44ef4e24165..c10f80a723c 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,17 +1,22 @@
-from test import support
-from test.support import os_helper
-from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
-                     STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer, generate_tokens,
-                     NEWLINE)
-from io import BytesIO, StringIO
+import contextlib
+import itertools
+import os
+import re
+import string
+import tempfile
+import token
+import tokenize
 import unittest
+from io import BytesIO, StringIO
 from textwrap import dedent
 from unittest import TestCase, mock
-from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
-                               INVALID_UNDERSCORE_LITERALS)
-import os
-import token
+from test import support
+from test.support import os_helper
+from test.support.script_helper import run_test_script, make_script, run_python_until_end
+from test.support.numbers import (
+    VALID_UNDERSCORE_LITERALS,
+    INVALID_UNDERSCORE_LITERALS,
+)
 
 
 # Converts a source string into a list of textual representation
@@ -24,12 +29,12 @@ def stringify_tokens_from_source(token_generator, source_string):
     missing_trailing_nl = source_string[-1] not in '\r\n'
 
     for type, token, start, end, line in token_generator:
-        if type == ENDMARKER:
+        if type == tokenize.ENDMARKER:
             break
         # Ignore the new line on the last line if the input lacks one
-        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
+        if missing_trailing_nl and type == tokenize.NEWLINE and end[0] == num_lines:
             continue
-        type = tok_name[type]
+        type = tokenize.tok_name[type]
         result.append(f"    {type:10} {token!r:13} {start} {end}")
 
     return result
@@ -45,18 +50,37 @@ def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
         # The ENDMARKER and final NEWLINE are omitted.
         f = BytesIO(s.encode('utf-8'))
-        result = stringify_tokens_from_source(tokenize(f.readline), s)
+        result = stringify_tokens_from_source(tokenize.tokenize(f.readline), s)
         self.assertEqual(result,
                          ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())
 
+    def test_invalid_readline(self):
+        def gen():
+            yield "sdfosdg"
+            yield "sdfosdg"
+        with self.assertRaises(TypeError):
+            list(tokenize.tokenize(gen().__next__))
+
+        def gen():
+            yield b"sdfosdg"
+            yield b"sdfosdg"
+        with self.assertRaises(TypeError):
+            list(tokenize.generate_tokens(gen().__next__))
+
+        def gen():
+            yield "sdfosdg"
+            1/0
+        with self.assertRaises(ZeroDivisionError):
+            list(tokenize.generate_tokens(gen().__next__))
+
     def test_implicit_newline(self):
         # Make sure that the tokenizer puts in an implicit NEWLINE
         # when the input lacks a trailing new line.
         f = BytesIO("x".encode('utf-8'))
-        tokens = list(tokenize(f.readline))
-        self.assertEqual(tokens[-2].type, NEWLINE)
-        self.assertEqual(tokens[-1].type, ENDMARKER)
+        tokens = list(tokenize.tokenize(f.readline))
+        self.assertEqual(tokens[-2].type, tokenize.NEWLINE)
+        self.assertEqual(tokens[-1].type, tokenize.ENDMARKER)
 
     def test_basic(self):
         self.check_tokenize("1 + 1", """\
@@ -83,6 +107,32 @@ def test_basic(self):
     NEWLINE    '\\n'          (4, 26) (4, 27)
     DEDENT     ''            (5, 0) (5, 0)
     """)
+
+        self.check_tokenize("if True:\r\n    # NL\r\n    foo='bar'\r\n\r\n", """\
+    NAME       'if'          (1, 0) (1, 2)
+    NAME       'True'        (1, 3) (1, 7)
+    OP         ':'           (1, 7) (1, 8)
+    NEWLINE    '\\r\\n'        (1, 8) (1, 10)
+    COMMENT    '# NL'        (2, 4) (2, 8)
+    NL         '\\r\\n'        (2, 8) (2, 10)
+    INDENT     '    '        (3, 0) (3, 4)
+    NAME       'foo'         (3, 4) (3, 7)
+    OP         '='           (3, 7) (3, 8)
+    STRING     "\'bar\'"       (3, 8) (3, 13)
+    NEWLINE    '\\r\\n'        (3, 13) (3, 15)
+    NL         '\\r\\n'        (4, 0) (4, 2)
+    DEDENT     ''            (5, 0) (5, 0)
+            """)
+
+        self.check_tokenize("x = 1 + \\\r\n1\r\n", """\
+    NAME       'x'           (1, 0) (1, 1)
+    OP         '='           (1, 2) (1, 3)
+    NUMBER     '1'           (1, 4) (1, 5)
+    OP         '+'           (1, 6) (1, 7)
+    NUMBER     '1'           (2, 0) (2, 1)
+    NEWLINE    '\\r\\n'        (2, 1) (2, 3)
+            """)
+
         indent_error_file = b"""\
 def k(x):
     x += 2
@@ -91,9 +141,18 @@ def k(x):
         readline = BytesIO(indent_error_file).readline
         with self.assertRaisesRegex(IndentationError,
                                     "unindent does not match any "
-                                    "outer indentation level"):
-            for tok in tokenize(readline):
+                                    "outer indentation level") as e:
+            for tok in tokenize.tokenize(readline):
                 pass
+        self.assertEqual(e.exception.lineno, 3)
+        self.assertEqual(e.exception.filename, '<string>')
+        self.assertEqual(e.exception.end_lineno, None)
+        self.assertEqual(e.exception.end_offset, None)
+        self.assertEqual(
+            e.exception.msg,
+            'unindent does not match any outer indentation level')
+        self.assertEqual(e.exception.offset, 9)
+        self.assertEqual(e.exception.text, '  x += 5')
 
     def test_int(self):
         # Ordinary integers and binary operators
@@ -177,7 +236,7 @@ def test_long(self):
     """)
 
     def test_float(self):
-        # Floating point numbers
+        # Floating-point numbers
         self.check_tokenize("x = 3.14159", """\
     NAME       'x'           (1, 0) (1, 1)
     OP         '='           (1, 2) (1, 3)
@@ -219,8 +278,8 @@ def test_float(self):
     def test_underscore_literals(self):
         def number_token(s):
             f = BytesIO(s.encode('utf-8'))
-            for toktype, token, start, end, line in tokenize(f.readline):
-                if toktype == NUMBER:
+            for toktype, token, start, end, line in tokenize.tokenize(f.readline):
+                if toktype == tokenize.NUMBER:
                     return token
             return 'invalid token'
         for lit in VALID_UNDERSCORE_LITERALS:
@@ -228,7 +287,16 @@ def number_token(s):
                 # this won't work with compound complex inputs
                 continue
             self.assertEqual(number_token(lit), lit)
+        # Valid cases with extra underscores in the tokenize module
+        # See gh-105549 for context
+        extra_valid_cases = {"0_7", "09_99"}
         for lit in INVALID_UNDERSCORE_LITERALS:
+            if lit in extra_valid_cases:
+                continue
+            try:
+                number_token(lit)
+            except tokenize.TokenError:
+                continue
             self.assertNotEqual(number_token(lit), lit)
 
     def test_string(self):
@@ -380,21 +448,175 @@ def test_string(self):
     STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
     """)
         self.check_tokenize('f"abc"', """\
-    STRING     'f"abc"'      (1, 0) (1, 6)
+    FSTRING_START 'f"'          (1, 0) (1, 2)
+    FSTRING_MIDDLE 'abc'         (1, 2) (1, 5)
+    FSTRING_END '"'           (1, 5) (1, 6)
     """)
         self.check_tokenize('fR"a{b}c"', """\
-    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
+    FSTRING_START 'fR"'         (1, 0) (1, 3)
+    FSTRING_MIDDLE 'a'           (1, 3) (1, 4)
+    OP         '{'           (1, 4) (1, 5)
+    NAME       'b'           (1, 5) (1, 6)
+    OP         '}'           (1, 6) (1, 7)
+    FSTRING_MIDDLE 'c'           (1, 7) (1, 8)
+    FSTRING_END '"'           (1, 8) (1, 9)
+    """)
+        self.check_tokenize('fR"a{{{b!r}}}c"', """\
+    FSTRING_START 'fR"'         (1, 0) (1, 3)
+    FSTRING_MIDDLE 'a{'          (1, 3) (1, 5)
+    OP         '{'           (1, 6) (1, 7)
+    NAME       'b'           (1, 7) (1, 8)
+    OP         '!'           (1, 8) (1, 9)
+    NAME       'r'           (1, 9) (1, 10)
+    OP         '}'           (1, 10) (1, 11)
+    FSTRING_MIDDLE '}'           (1, 11) (1, 12)
+    FSTRING_MIDDLE 'c'           (1, 13) (1, 14)
+    FSTRING_END '"'           (1, 14) (1, 15)
+    """)
+        self.check_tokenize('f"{{{1+1}}}"', """\
+    FSTRING_START 'f"'          (1, 0) (1, 2)
+    FSTRING_MIDDLE '{'           (1, 2) (1, 3)
+    OP         '{'           (1, 4) (1, 5)
+    NUMBER     '1'           (1, 5) (1, 6)
+    OP         '+'           (1, 6) (1, 7)
+    NUMBER     '1'           (1, 7) (1, 8)
+    OP         '}'           (1, 8) (1, 9)
+    FSTRING_MIDDLE '}'           (1, 9) (1, 10)
+    FSTRING_END '"'           (1, 11) (1, 12)
+    """)
+        self.check_tokenize('f"""{f\'\'\'{f\'{f"{1+1}"}\'}\'\'\'}"""', """\
+    FSTRING_START 'f\"""'        (1, 0) (1, 4)
+    OP         '{'           (1, 4) (1, 5)
+    FSTRING_START "f'''"        (1, 5) (1, 9)
+    OP         '{'           (1, 9) (1, 10)
+    FSTRING_START "f'"          (1, 10) (1, 12)
+    OP         '{'           (1, 12) (1, 13)
+    FSTRING_START 'f"'          (1, 13) (1, 15)
+    OP         '{'           (1, 15) (1, 16)
+    NUMBER     '1'           (1, 16) (1, 17)
+    OP         '+'           (1, 17) (1, 18)
+    NUMBER     '1'           (1, 18) (1, 19)
+    OP         '}'           (1, 19) (1, 20)
+    FSTRING_END '"'           (1, 20) (1, 21)
+    OP         '}'           (1, 21) (1, 22)
+    FSTRING_END "'"           (1, 22) (1, 23)
+    OP         '}'           (1, 23) (1, 24)
+    FSTRING_END "'''"         (1, 24) (1, 27)
+    OP         '}'           (1, 27) (1, 28)
+    FSTRING_END '\"""'         (1, 28) (1, 31)
+    """)
+        self.check_tokenize('f"""     x\nstr(data, encoding={invalid!r})\n"""', """\
+    FSTRING_START 'f\"""'        (1, 0) (1, 4)
+    FSTRING_MIDDLE '     x\\nstr(data, encoding=' (1, 4) (2, 19)
+    OP         '{'           (2, 19) (2, 20)
+    NAME       'invalid'     (2, 20) (2, 27)
+    OP         '!'           (2, 27) (2, 28)
+    NAME       'r'           (2, 28) (2, 29)
+    OP         '}'           (2, 29) (2, 30)
+    FSTRING_MIDDLE ')\\n'         (2, 30) (3, 0)
+    FSTRING_END '\"""'         (3, 0) (3, 3)
+    """)
+        self.check_tokenize('f"""123456789\nsomething{None}bad"""', """\
+    FSTRING_START 'f\"""'        (1, 0) (1, 4)
+    FSTRING_MIDDLE '123456789\\nsomething' (1, 4) (2, 9)
+    OP         '{'           (2, 9) (2, 10)
+    NAME       'None'        (2, 10) (2, 14)
+    OP         '}'           (2, 14) (2, 15)
+    FSTRING_MIDDLE 'bad'         (2, 15) (2, 18)
+    FSTRING_END '\"""'         (2, 18) (2, 21)
     """)
         self.check_tokenize('f"""abc"""', """\
-    STRING     'f\"\"\"abc\"\"\"'  (1, 0) (1, 10)
+    FSTRING_START 'f\"""'        (1, 0) (1, 4)
+    FSTRING_MIDDLE 'abc'         (1, 4) (1, 7)
+    FSTRING_END '\"""'         (1, 7) (1, 10)
     """)
         self.check_tokenize(r'f"abc\
 def"', """\
-    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
+    FSTRING_START 'f"'          (1, 0) (1, 2)
+    FSTRING_MIDDLE 'abc\\\\\\ndef'  (1, 2) (2, 3)
+    FSTRING_END '"'           (2, 3) (2, 4)
     """)
         self.check_tokenize(r'Rf"abc\
 def"', """\
-    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
+    FSTRING_START 'Rf"'         (1, 0) (1, 3)
+    FSTRING_MIDDLE 'abc\\\\\\ndef'  (1, 3) (2, 3)
+    FSTRING_END '"'           (2, 3) (2, 4)
+    """)
+        self.check_tokenize("f'some words {a+b:.3f} more words {c+d=} final words'", """\
+    FSTRING_START "f'"          (1, 0) (1, 2)
+    FSTRING_MIDDLE 'some words ' (1, 2) (1, 13)
+    OP         '{'           (1, 13) (1, 14)
+    NAME       'a'           (1, 14) (1, 15)
+    OP         '+'           (1, 15) (1, 16)
+    NAME       'b'           (1, 16) (1, 17)
+    OP         ':'           (1, 17) (1, 18)
+    FSTRING_MIDDLE '.3f'         (1, 18) (1, 21)
+    OP         '}'           (1, 21) (1, 22)
+    FSTRING_MIDDLE ' more words ' (1, 22) (1, 34)
+    OP         '{'           (1, 34) (1, 35)
+    NAME       'c'           (1, 35) (1, 36)
+    OP         '+'           (1, 36) (1, 37)
+    NAME       'd'           (1, 37) (1, 38)
+    OP         '='           (1, 38) (1, 39)
+    OP         '}'           (1, 39) (1, 40)
+    FSTRING_MIDDLE ' final words' (1, 40) (1, 52)
+    FSTRING_END "'"           (1, 52) (1, 53)
+    """)
+        self.check_tokenize("""\
+f'''{
+3
+=}'''""", """\
+    FSTRING_START "f'''"        (1, 0) (1, 4)
+    OP         '{'           (1, 4) (1, 5)
+    NL         '\\n'          (1, 5) (1, 6)
+    NUMBER     '3'           (2, 0) (2, 1)
+    NL         '\\n'          (2, 1) (2, 2)
+    OP         '='           (3, 0) (3, 1)
+    OP         '}'           (3, 1) (3, 2)
+    FSTRING_END "'''"         (3, 2) (3, 5)
+    """)
+        self.check_tokenize("""\
+f'''__{
+    x:a
+}__'''""", """\
+    FSTRING_START "f'''"        (1, 0) (1, 4)
+    FSTRING_MIDDLE '__'          (1, 4) (1, 6)
+    OP         '{'           (1, 6) (1, 7)
+    NL         '\\n'          (1, 7) (1, 8)
+    NAME       'x'           (2, 4) (2, 5)
+    OP         ':'           (2, 5) (2, 6)
+    FSTRING_MIDDLE 'a\\n'         (2, 6) (3, 0)
+    OP         '}'           (3, 0) (3, 1)
+    FSTRING_MIDDLE '__'          (3, 1) (3, 3)
+    FSTRING_END "'''"         (3, 3) (3, 6)
+    """)
+        self.check_tokenize("""\
+f'''__{
+    x:a
+    b
+     c
+      d
+}__'''""", """\
+    FSTRING_START "f'''"        (1, 0) (1, 4)
+    FSTRING_MIDDLE '__'          (1, 4) (1, 6)
+    OP         '{'           (1, 6) (1, 7)
+    NL         '\\n'          (1, 7) (1, 8)
+    NAME       'x'           (2, 4) (2, 5)
+    OP         ':'           (2, 5) (2, 6)
+    FSTRING_MIDDLE 'a\\n    b\\n     c\\n      d\\n' (2, 6) (6, 0)
+    OP         '}'           (6, 0) (6, 1)
+    FSTRING_MIDDLE '__'          (6, 1) (6, 3)
+    FSTRING_END "'''"         (6, 3) (6, 6)
+    """)
+
+        self.check_tokenize("""\
+    '''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli
+    aktualni pracownicy, obecni pracownicy'''
+""", """\
+    INDENT     '    '        (1, 0) (1, 4)
+    STRING     "'''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli\\n    aktualni pracownicy, obecni pracownicy'''" (1, 4) (2, 45)
+    NEWLINE    '\\n'          (2, 45) (2, 46)
+    DEDENT     ''            (3, 0) (3, 0)
     """)
 
     def test_function(self):
@@ -945,29 +1167,98 @@ async def bar(): pass
     DEDENT     ''            (7, 0) (7, 0)
     """)
 
+    @unittest.expectedFailure  # TODO: RUSTPYTHON; +  "    NEWLINE    '\\n'          (4, 1) (4, 2)"]
+    def test_newline_after_parenthesized_block_with_comment(self):
+        self.check_tokenize('''\
+[
+    # A comment here
+    1
+]
+''', """\
+    OP         '['           (1, 0) (1, 1)
+    NL         '\\n'          (1, 1) (1, 2)
+    COMMENT    '# A comment here' (2, 4) (2, 20)
+    NL         '\\n'          (2, 20) (2, 21)
+    NUMBER     '1'           (3, 4) (3, 5)
+    NL         '\\n'          (3, 5) (3, 6)
+    OP         ']'           (4, 0) (4, 1)
+    NEWLINE    '\\n'          (4, 1) (4, 2)
+    """)
+
+    def test_closing_parenthesis_from_different_line(self):
+        self.check_tokenize("); x", """\
+    OP         ')'           (1, 0) (1, 1)
+    OP         ';'           (1, 1) (1, 2)
+    NAME       'x'           (1, 3) (1, 4)
+    """)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON; '    FSTRING_END "\'\'\'"         (2, 68) (2, 71)']
+    def test_multiline_non_ascii_fstring(self):
+        self.check_tokenize("""\
+a = f'''
+    Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli'''""", """\
+    NAME       'a'           (1, 0) (1, 1)
+    OP         '='           (1, 2) (1, 3)
+    FSTRING_START "f\'\'\'"        (1, 4) (1, 8)
+    FSTRING_MIDDLE '\\n    Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli' (1, 8) (2, 68)
+    FSTRING_END "\'\'\'"         (2, 68) (2, 71)
+    """)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON; Diff is 696 characters long. Set self.maxDiff to None to see it.
+    def test_multiline_non_ascii_fstring_with_expr(self):
+        self.check_tokenize("""\
+f'''
+    🔗 This is a test {test_arg1}🔗
+🔗'''""", """\
+    FSTRING_START "f\'\'\'"        (1, 0) (1, 4)
+    FSTRING_MIDDLE '\\n    🔗 This is a test ' (1, 4) (2, 21)
+    OP         '{'           (2, 21) (2, 22)
+    NAME       'test_arg1'   (2, 22) (2, 31)
+    OP         '}'           (2, 31) (2, 32)
+    FSTRING_MIDDLE '🔗\\n🔗'        (2, 32) (3, 1)
+    FSTRING_END "\'\'\'"         (3, 1) (3, 4)
+    """)
+
+        # gh-139516, the '\n' is explicit to ensure no trailing whitespace which would invalidate the test
+        self.check_tokenize('''f"{f(a=lambda: 'à'\n)}"''', """\
+    FSTRING_START \'f"\'          (1, 0) (1, 2)
+    OP         '{'           (1, 2) (1, 3)
+    NAME       'f'           (1, 3) (1, 4)
+    OP         '('           (1, 4) (1, 5)
+    NAME       'a'           (1, 5) (1, 6)
+    OP         '='           (1, 6) (1, 7)
+    NAME       'lambda'      (1, 7) (1, 13)
+    OP         ':'           (1, 13) (1, 14)
+    STRING     "\'à\'"         (1, 15) (1, 18)
+    NL         '\\n'          (1, 18) (1, 19)
+    OP         ')'           (2, 0) (2, 1)
+    OP         '}'           (2, 1) (2, 2)
+    FSTRING_END \'"\'           (2, 2) (2, 3)
+    """)
+
 class GenerateTokensTest(TokenizeTest):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
         # The ENDMARKER and final NEWLINE are omitted.
         f = StringIO(s)
-        result = stringify_tokens_from_source(generate_tokens(f.readline), s)
+        result = stringify_tokens_from_source(tokenize.generate_tokens(f.readline), s)
         self.assertEqual(result, expected.rstrip().splitlines())
 
 
 def decistmt(s):
     result = []
-    g = tokenize(BytesIO(s.encode('utf-8')).readline)   # tokenize the string
+    g = tokenize.tokenize(BytesIO(s.encode('utf-8')).readline)   # tokenize the string
     for toknum, tokval, _, _, _  in g:
-        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
+        if toknum == tokenize.NUMBER and '.' in tokval:  # replace NUMBER tokens
             result.extend([
-                (NAME, 'Decimal'),
-                (OP, '('),
-                (STRING, repr(tokval)),
-                (OP, ')')
+                (tokenize.NAME, 'Decimal'),
+                (tokenize.OP, '('),
+                (tokenize.STRING, repr(tokval)),
+                (tokenize.OP, ')')
             ])
         else:
             result.append((toknum, tokval))
-    return untokenize(result).decode('utf-8')
+    return tokenize.untokenize(result).decode('utf-8').strip()
 
 class TestMisc(TestCase):
 
@@ -991,6 +1282,13 @@ def test_decistmt(self):
         self.assertEqual(eval(decistmt(s)),
                          Decimal('-3.217160342717258261933904529E-7'))
 
+    def test___all__(self):
+        expected = token.__all__ + [
+            "TokenInfo", "TokenError", "generate_tokens",
+            "detect_encoding", "untokenize", "open", "tokenize",
+        ]
+        self.assertCountEqual(tokenize.__all__, expected)
+
 
 class TestTokenizerAdheresToPep0263(TestCase):
     """
@@ -998,8 +1296,9 @@ class TestTokenizerAdheresToPep0263(TestCase):
     """
 
     def _testFile(self, filename):
-        path = os.path.join(os.path.dirname(__file__), filename)
-        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
+        path = os.path.join(os.path.dirname(__file__), 'tokenizedata', filename)
+        with open(path, 'rb') as f:
+            TestRoundtrip.check_roundtrip(self, f)
 
     def test_utf8_coding_cookie_and_no_utf8_bom(self):
         f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
@@ -1024,8 +1323,6 @@ def test_utf8_coding_cookie_and_utf8_bom(self):
         f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
         self._testFile(f)
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure # "bad_coding.py" and "bad_coding2.py" make the WASM CI fail
     def test_bad_coding_cookie(self):
         self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
         self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')
@@ -1041,33 +1338,18 @@ def readline():
             nonlocal first
             if not first:
                 first = True
-                return line
+                yield line
             else:
-                return b''
+                yield b''
 
         # skip the initial encoding token and the end tokens
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
-        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
+        tokens = list(tokenize._generate_tokens_from_c_tokenizer(readline().__next__,
+                                                                 encoding='utf-8',
+                                                                 extra_tokens=True))[:-2]
+        expected_tokens = [tokenize.TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
 
-    def test__tokenize_does_not_decode_with_encoding_none(self):
-        literal = '"ЉЊЈЁЂ"'
-        first = False
-        def readline():
-            nonlocal first
-            if not first:
-                first = True
-                return literal
-            else:
-                return b''
-
-        # skip the end tokens
-        tokens = list(_tokenize(readline, encoding=None))[:-2]
-        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
-        self.assertEqual(tokens, expected_tokens,
-                         "string not tokenized when encoding is None")
-
 
 class TestDetectEncoding(TestCase):
 
@@ -1084,24 +1366,63 @@ def readline():
 
     def test_no_bom_no_encoding_cookie(self):
         lines = (
-            b'# something\n',
+            b'#!/home/\xc3\xa4/bin/python\n',
+            b'# something \xe2\x82\xac\n',
             b'print(something)\n',
             b'do_something(else)\n'
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, list(lines[:2]))
 
+    def test_no_bom_no_encoding_cookie_first_line_error(self):
+        lines = (
+            b'#!/home/\xa4/bin/python\n\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_no_bom_no_encoding_cookie_second_line_error(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'# something \xe2\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_bom_no_cookie(self):
         lines = (
-            b'\xef\xbb\xbf# something\n',
+            b'\xef\xbb\xbf#!/home/\xc3\xa4/bin/python\n',
             b'print(something)\n',
             b'do_something(else)\n'
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
         self.assertEqual(encoding, 'utf-8-sig')
         self.assertEqual(consumed_lines,
-                         [b'# something\n', b'print(something)\n'])
+                         [b'#!/home/\xc3\xa4/bin/python\n', b'print(something)\n'])
+
+    def test_bom_no_cookie_first_line_error(self):
+        lines = (
+            b'\xef\xbb\xbf#!/home/\xa4/bin/python\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_bom_no_cookie_second_line_error(self):
+        lines = (
+            b'\xef\xbb\xbf#!/usr/bin/python\n',
+            b'# something \xe2\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
 
     def test_cookie_first_line_no_bom(self):
         lines = (
@@ -1109,7 +1430,7 @@ def test_cookie_first_line_no_bom(self):
             b'print(something)\n',
             b'do_something(else)\n'
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
         self.assertEqual(encoding, 'iso-8859-1')
         self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
 
@@ -1119,7 +1440,7 @@ def test_matched_bom_and_cookie_first_line(self):
             b'print(something)\n',
             b'do_something(else)\n'
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
         self.assertEqual(encoding, 'utf-8-sig')
         self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])
 
@@ -1130,7 +1451,7 @@ def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
             b'do_something(else)\n'
         )
         readline = self.get_readline(lines)
-        self.assertRaises(SyntaxError, detect_encoding, readline)
+        self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
 
     def test_cookie_second_line_no_bom(self):
         lines = (
@@ -1139,7 +1460,7 @@ def test_cookie_second_line_no_bom(self):
             b'print(something)\n',
             b'do_something(else)\n'
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
         self.assertEqual(encoding, 'ascii')
         expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
         self.assertEqual(consumed_lines, expected)
@@ -1151,7 +1472,7 @@ def test_matched_bom_and_cookie_second_line(self):
             b'print(something)\n',
             b'do_something(else)\n'
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
         self.assertEqual(encoding, 'utf-8-sig')
         self.assertEqual(consumed_lines,
                          [b'#! something\n', b'f# coding=utf-8\n'])
@@ -1164,7 +1485,7 @@ def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
             b'do_something(else)\n'
         )
         readline = self.get_readline(lines)
-        self.assertRaises(SyntaxError, detect_encoding, readline)
+        self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
 
     def test_cookie_second_line_noncommented_first_line(self):
         lines = (
@@ -1172,21 +1493,65 @@ def test_cookie_second_line_noncommented_first_line(self):
             b'# vim: set fileencoding=iso8859-15 :\n',
             b"print('\xe2\x82\xac')\n"
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
         self.assertEqual(encoding, 'utf-8')
         expected = [b"print('\xc2\xa3')\n"]
         self.assertEqual(consumed_lines, expected)
 
-    def test_cookie_second_line_commented_first_line(self):
+    def test_first_non_utf8_coding_line(self):
         lines = (
-            b"#print('\xc2\xa3')\n",
-            b'# vim: set fileencoding=iso8859-15 :\n',
-            b"print('\xe2\x82\xac')\n"
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
-        self.assertEqual(encoding, 'iso8859-15')
-        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
-        self.assertEqual(consumed_lines, expected)
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:1]))
+
+    def test_first_utf8_coding_line_error(self):
+        lines = (
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_second_non_utf8_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_second_utf8_coding_line_error(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_non_utf8_shebang(self):
+        lines = (
+            b'#!/home/\xa4/bin/python\n',
+            b'#coding:iso-8859-15\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_utf8_shebang_error(self):
+        lines = (
+            b'#!/home/\xc3\xa4/bin/python\n',
+            b'#coding:ascii\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
 
     def test_cookie_second_line_empty_first_line(self):
         lines = (
@@ -1194,13 +1559,77 @@ def test_cookie_second_line_empty_first_line(self):
             b'# vim: set fileencoding=iso8859-15 :\n',
             b"print('\xe2\x82\xac')\n"
         )
-        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
         self.assertEqual(encoding, 'iso8859-15')
         expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
         self.assertEqual(consumed_lines, expected)
 
+    def test_cookie_third_line(self):
+        lines = (
+            b'#!/home/\xc3\xa4/bin/python\n',
+            b'# something\n',
+            b'# vim: set fileencoding=ascii :\n',
+            b'print(something)\n',
+            b'do_something(else)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_double_coding_line(self):
+        # If the first line matches, the second line is ignored.
+        lines = (
+            b'#coding:iso8859-15\n',
+            b'#coding:latin1\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso8859-15')
+        self.assertEqual(consumed_lines, list(lines[:1]))
+
+    def test_double_coding_same_line(self):
+        lines = (
+            b'#coding:iso8859-15 coding:latin1\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso8859-15')
+        self.assertEqual(consumed_lines, list(lines[:1]))
+
+    def test_double_coding_utf8(self):
+        lines = (
+            b'#coding:utf-8\n',
+            b'#coding:latin1\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'utf-8')
+        self.assertEqual(consumed_lines, list(lines[:1]))
+
+    def test_nul_in_first_coding_line(self):
+        lines = (
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_nul_in_second_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_latin1_normalization(self):
-        # See get_normal_name() in tokenizer.c.
+        # See get_normal_name() in Parser/tokenizer/helpers.c.
         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                      "iso-8859-1-unix", "iso-latin-1-mac")
         for encoding in encodings:
@@ -1211,21 +1640,20 @@ def test_latin1_normalization(self):
                          b"print(things)\n",
                          b"do_something += 4\n")
                 rl = self.get_readline(lines)
-                found, consumed_lines = detect_encoding(rl)
+                found, consumed_lines = tokenize.detect_encoding(rl)
                 self.assertEqual(found, "iso-8859-1")
 
     def test_syntaxerror_latin1(self):
-        # Issue 14629: need to raise SyntaxError if the first
+        # Issue 14629: need to raise SyntaxError if the first
         # line(s) have non-UTF-8 characters
         lines = (
             b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
             )
         readline = self.get_readline(lines)
-        self.assertRaises(SyntaxError, detect_encoding, readline)
-
+        self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
 
     def test_utf8_normalization(self):
-        # See get_normal_name() in tokenizer.c.
+        # See get_normal_name() in Parser/tokenizer/helpers.c.
         encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
         for encoding in encodings:
             for rep in ("-", "_"):
@@ -1234,39 +1662,40 @@ def test_utf8_normalization(self):
                          b"# coding: " + enc.encode("ascii") + b"\n",
                          b"1 + 3\n")
                 rl = self.get_readline(lines)
-                found, consumed_lines = detect_encoding(rl)
+                found, consumed_lines = tokenize.detect_encoding(rl)
                 self.assertEqual(found, "utf-8")
 
     def test_short_files(self):
         readline = self.get_readline((b'print(something)\n',))
-        encoding, consumed_lines = detect_encoding(readline)
+        encoding, consumed_lines = tokenize.detect_encoding(readline)
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, [b'print(something)\n'])
 
-        encoding, consumed_lines = detect_encoding(self.get_readline(()))
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(()))
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, [])
 
         readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
-        encoding, consumed_lines = detect_encoding(readline)
+        encoding, consumed_lines = tokenize.detect_encoding(readline)
         self.assertEqual(encoding, 'utf-8-sig')
         self.assertEqual(consumed_lines, [b'print(something)\n'])
 
         readline = self.get_readline((b'\xef\xbb\xbf',))
-        encoding, consumed_lines = detect_encoding(readline)
+        encoding, consumed_lines = tokenize.detect_encoding(readline)
         self.assertEqual(encoding, 'utf-8-sig')
         self.assertEqual(consumed_lines, [])
 
         readline = self.get_readline((b'# coding: bad\n',))
-        self.assertRaises(SyntaxError, detect_encoding, readline)
+        self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
 
     def test_false_encoding(self):
         # Issue 18873: "Encoding" detected in non-comment lines
         readline = self.get_readline((b'print("#coding=fake")',))
-        encoding, consumed_lines = detect_encoding(readline)
+        encoding, consumed_lines = tokenize.detect_encoding(readline)
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, [b'print("#coding=fake")'])
 
+    @support.thread_unsafe
     def test_open(self):
         filename = os_helper.TESTFN + '.py'
         self.addCleanup(os_helper.unlink, filename)
@@ -1276,14 +1705,14 @@ def test_open(self):
             with open(filename, 'w', encoding=encoding) as fp:
                 print("# coding: %s" % encoding, file=fp)
                 print("print('euro:\u20ac')", file=fp)
-            with tokenize_open(filename) as fp:
+            with tokenize.open(filename) as fp:
                 self.assertEqual(fp.encoding, encoding)
                 self.assertEqual(fp.mode, 'r')
 
         # test BOM (no coding cookie)
         with open(filename, 'w', encoding='utf-8-sig') as fp:
             print("print('euro:\u20ac')", file=fp)
-        with tokenize_open(filename) as fp:
+        with tokenize.open(filename) as fp:
             self.assertEqual(fp.encoding, 'utf-8-sig')
             self.assertEqual(fp.mode, 'r')
 
@@ -1310,16 +1739,16 @@ def readline(self):
             ins = Bunk(lines, path)
             # Make sure lacking a name isn't an issue.
             del ins.name
-            detect_encoding(ins.readline)
+            tokenize.detect_encoding(ins.readline)
         with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
             ins = Bunk(lines, path)
-            detect_encoding(ins.readline)
+            tokenize.detect_encoding(ins.readline)
 
     def test_open_error(self):
         # Issue #23840: open() must close the binary file on error
         m = BytesIO(b'#coding:xxx')
         with mock.patch('tokenize._builtin_open', return_value=m):
-            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
+            self.assertRaises(SyntaxError, tokenize.open, 'foobar')
         self.assertTrue(m.closed)
 
 
@@ -1327,17 +1756,20 @@ class TestTokenize(TestCase):
 
     def test_tokenize(self):
         import tokenize as tokenize_module
-        encoding = object()
+        encoding = "utf-8"
         encoding_used = None
         def mock_detect_encoding(readline):
             return encoding, [b'first', b'second']
 
-        def mock__tokenize(readline, encoding):
+        def mock__tokenize(readline, encoding, **kwargs):
             nonlocal encoding_used
             encoding_used = encoding
             out = []
             while True:
-                next_line = readline()
+                try:
+                    next_line = readline()
+                except StopIteration:
+                    return out
                 if next_line:
                     out.append(next_line)
                     continue
@@ -1352,16 +1784,16 @@ def mock_readline():
             return str(counter).encode()
 
         orig_detect_encoding = tokenize_module.detect_encoding
-        orig__tokenize = tokenize_module._tokenize
+        orig_c_token = tokenize_module._generate_tokens_from_c_tokenizer
         tokenize_module.detect_encoding = mock_detect_encoding
-        tokenize_module._tokenize = mock__tokenize
+        tokenize_module._generate_tokens_from_c_tokenizer = mock__tokenize
         try:
-            results = tokenize(mock_readline)
-            self.assertEqual(list(results),
+            results = tokenize.tokenize(mock_readline)
+            self.assertEqual(list(results)[1:],
                              [b'first', b'second', b'1', b'2', b'3', b'4'])
         finally:
             tokenize_module.detect_encoding = orig_detect_encoding
-            tokenize_module._tokenize = orig__tokenize
+            tokenize_module._generate_tokens_from_c_tokenizer = orig_c_token
 
         self.assertEqual(encoding_used, encoding)
 
@@ -1373,23 +1805,23 @@ def test_oneline_defs(self):
         buf = '\n'.join(buf)
 
         # Test that 500 consequent, one-line defs is OK
-        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
+        toks = list(tokenize.tokenize(BytesIO(buf.encode('utf-8')).readline))
         self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
                                                 # [-2] is always NEWLINE
 
     def assertExactTypeEqual(self, opstr, *optypes):
-        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
+        tokens = list(tokenize.tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
         self.assertEqual(len(tokens), 3 + num_optypes)
-        self.assertEqual(tok_name[tokens[0].exact_type],
-                         tok_name[ENCODING])
+        self.assertEqual(tokenize.tok_name[tokens[0].exact_type],
+                         tokenize.tok_name[tokenize.ENCODING])
         for i in range(num_optypes):
-            self.assertEqual(tok_name[tokens[i + 1].exact_type],
-                             tok_name[optypes[i]])
-        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
-                         tok_name[token.NEWLINE])
-        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
-                         tok_name[token.ENDMARKER])
+            self.assertEqual(tokenize.tok_name[tokens[i + 1].exact_type],
+                             tokenize.tok_name[optypes[i]])
+        self.assertEqual(tokenize.tok_name[tokens[1 + num_optypes].exact_type],
+                         tokenize.tok_name[token.NEWLINE])
+        self.assertEqual(tokenize.tok_name[tokens[2 + num_optypes].exact_type],
+                         tokenize.tok_name[token.ENDMARKER])
 
     def test_exact_type(self):
         self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
@@ -1439,11 +1871,11 @@ def test_exact_type(self):
         self.assertExactTypeEqual('@=', token.ATEQUAL)
 
         self.assertExactTypeEqual('a**2+b**2==c**2',
-                                  NAME, token.DOUBLESTAR, NUMBER,
+                                  tokenize.NAME, token.DOUBLESTAR, tokenize.NUMBER,
                                   token.PLUS,
-                                  NAME, token.DOUBLESTAR, NUMBER,
+                                  tokenize.NAME, token.DOUBLESTAR, tokenize.NUMBER,
                                   token.EQEQUAL,
-                                  NAME, token.DOUBLESTAR, NUMBER)
+                                  tokenize.NAME, token.DOUBLESTAR, tokenize.NUMBER)
         self.assertExactTypeEqual('{1, 2, 3}',
                                   token.LBRACE,
                                   token.NUMBER, token.COMMA,
@@ -1463,19 +1895,55 @@ def test_pathological_trailing_whitespace(self):
     def test_comment_at_the_end_of_the_source_without_newline(self):
         # See http://bugs.python.org/issue44667
         source = 'b = 1\n\n#test'
-        expected_tokens = [token.NAME, token.EQUAL, token.NUMBER, token.NEWLINE, token.NL, token.COMMENT]
+        expected_tokens = [
+            tokenize.TokenInfo(type=token.ENCODING, string='utf-8', start=(0, 0), end=(0, 0), line=''),
+            tokenize.TokenInfo(type=token.NAME, string='b', start=(1, 0), end=(1, 1), line='b = 1\n'),
+            tokenize.TokenInfo(type=token.OP, string='=', start=(1, 2), end=(1, 3), line='b = 1\n'),
+            tokenize.TokenInfo(type=token.NUMBER, string='1', start=(1, 4), end=(1, 5), line='b = 1\n'),
+            tokenize.TokenInfo(type=token.NEWLINE, string='\n', start=(1, 5), end=(1, 6), line='b = 1\n'),
+            tokenize.TokenInfo(type=token.NL, string='\n', start=(2, 0), end=(2, 1), line='\n'),
+            tokenize.TokenInfo(type=token.COMMENT, string='#test', start=(3, 0), end=(3, 5), line='#test'),
+            tokenize.TokenInfo(type=token.NL, string='', start=(3, 5), end=(3, 6), line='#test'),
+            tokenize.TokenInfo(type=token.ENDMARKER, string='', start=(4, 0), end=(4, 0), line='')
+        ]
+
+        tokens = list(tokenize.tokenize(BytesIO(source.encode('utf-8')).readline))
+        self.assertEqual(tokens, expected_tokens)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON; Diff is 869 characters long. Set self.maxDiff to None to see it.
+    def test_newline_and_space_at_the_end_of_the_source_without_newline(self):
+        # See https://github.com/python/cpython/issues/105435
+        source = 'a\n '
+        expected_tokens = [
+            tokenize.TokenInfo(token.ENCODING, string='utf-8', start=(0, 0), end=(0, 0), line=''),
+            tokenize.TokenInfo(token.NAME, string='a', start=(1, 0), end=(1, 1), line='a\n'),
+            tokenize.TokenInfo(token.NEWLINE, string='\n', start=(1, 1), end=(1, 2), line='a\n'),
+            tokenize.TokenInfo(token.NL, string='', start=(2, 1), end=(2, 2), line=' '),
+            tokenize.TokenInfo(token.ENDMARKER, string='', start=(3, 0), end=(3, 0), line='')
+        ]
+
+        tokens = list(tokenize.tokenize(BytesIO(source.encode('utf-8')).readline))
+        self.assertEqual(tokens, expected_tokens)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON; AssertionError: b'SyntaxError' not found in b'OSError: stream did not contain valid UTF-8\n'
+    def test_invalid_character_in_fstring_middle(self):
+        # See gh-103824
+        script = b'''F"""
+        \xe5"""'''
+
+        with os_helper.temp_dir() as temp_dir:
+            filename = os.path.join(temp_dir, "script.py")
+            with open(filename, 'wb') as file:
+                file.write(script)
+            rs, _ = run_python_until_end(filename)
+            self.assertIn(b"SyntaxError", rs.err)
 
-        tokens = list(tokenize(BytesIO(source.encode('utf-8')).readline))
-        self.assertEqual(tok_name[tokens[0].exact_type], tok_name[ENCODING])
-        for i in range(6):
-            self.assertEqual(tok_name[tokens[i + 1].exact_type], tok_name[expected_tokens[i]])
-        self.assertEqual(tok_name[tokens[-1].exact_type], tok_name[token.ENDMARKER])
 
 class UntokenizeTest(TestCase):
 
     def test_bad_input_order(self):
         # raise if previous row
-        u = Untokenizer()
+        u = tokenize.Untokenizer()
         u.prev_row = 2
         u.prev_col = 2
         with self.assertRaises(ValueError) as cm:
@@ -1487,7 +1955,7 @@ def test_bad_input_order(self):
 
     def test_backslash_continuation(self):
         # The problem is that <whitespace>\<newline> leaves no token
-        u = Untokenizer()
+        u = tokenize.Untokenizer()
         u.prev_row = 1
         u.prev_col =  1
         u.tokens = []
@@ -1499,17 +1967,33 @@ def test_backslash_continuation(self):
         TestRoundtrip.check_roundtrip(self, 'a\n  b\n    c\n  \\\n  c\n')
 
     def test_iter_compat(self):
-        u = Untokenizer()
-        token = (NAME, 'Hello')
-        tokens = [(ENCODING, 'utf-8'), token]
+        u = tokenize.Untokenizer()
+        token = (tokenize.NAME, 'Hello')
+        tokens = [(tokenize.ENCODING, 'utf-8'), token]
         u.compat(token, iter([]))
         self.assertEqual(u.tokens, ["Hello "])
-        u = Untokenizer()
+        u = tokenize.Untokenizer()
         self.assertEqual(u.untokenize(iter([token])), 'Hello ')
-        u = Untokenizer()
+        u = tokenize.Untokenizer()
         self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
         self.assertEqual(u.encoding, 'utf-8')
-        self.assertEqual(untokenize(iter(tokens)), b'Hello ')
+        self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ')
+
+
+def contains_ambiguous_backslash(source):
+    """Return `True` if the source contains a backslash on a
+    line by itself. For example:
+
+    a = (1
+        \\
+    )
+
+    Code like this cannot be untokenized exactly. This is because
+    the tokenizer does not produce any tokens for the line containing
+    the backslash and so there is no way to know its indent.
+    """
+    pattern = re.compile(br'\n\s*\\\r?\n')
+    return pattern.search(source) is not None
 
 
 class TestRoundtrip(TestCase):
@@ -1522,6 +2006,9 @@ def check_roundtrip(self, f):
         tokenize.untokenize(), and the latter tokenized again to 2-tuples.
         The test fails if the 3 pair tokenizations do not match.
 
+        If the source code can be untokenized unambiguously, the
+        untokenized code must match the original code exactly.
+
         When untokenize bugs are fixed, untokenize with 5-tuples should
         reproduce code that does not contain a backslash continuation
         following spaces.  A proper test should test this.
@@ -1531,21 +2018,38 @@ def check_roundtrip(self, f):
             code = f.encode('utf-8')
         else:
             code = f.read()
-            f.close()
         readline = iter(code.splitlines(keepends=True)).__next__
-        tokens5 = list(tokenize(readline))
+        tokens5 = list(tokenize.tokenize(readline))
         tokens2 = [tok[:2] for tok in tokens5]
         # Reproduce tokens2 from pairs
-        bytes_from2 = untokenize(tokens2)
+        bytes_from2 = tokenize.untokenize(tokens2)
         readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
-        tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
+        tokens2_from2 = [tok[:2] for tok in tokenize.tokenize(readline2)]
         self.assertEqual(tokens2_from2, tokens2)
         # Reproduce tokens2 from 5-tuples
-        bytes_from5 = untokenize(tokens5)
+        bytes_from5 = tokenize.untokenize(tokens5)
         readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
-        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
+        tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)
 
+        if not contains_ambiguous_backslash(code):
+            # The BOM does not produce a token so there is no way to preserve it.
+            code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
+            readline = iter(code_without_bom.splitlines(keepends=True)).__next__
+            untokenized_code = tokenize.untokenize(tokenize.tokenize(readline))
+            self.assertEqual(code_without_bom, untokenized_code)
+
+    def check_line_extraction(self, f):
+        if isinstance(f, str):
+            code = f.encode('utf-8')
+        else:
+            code = f.read()
+        readline = iter(code.splitlines(keepends=True)).__next__
+        for tok in tokenize.tokenize(readline):
+            if tok.type in  {tokenize.ENCODING, tokenize.ENDMARKER}:
+                continue
+            self.assertEqual(tok.string, tok.line[tok.start[1]: tok.end[1]])
+
     def test_roundtrip(self):
         # There are some standard formatting practices that are easy to get right.
 
@@ -1561,7 +2065,7 @@ def test_roundtrip(self):
 
         self.check_roundtrip("if x == 1 : \n"
                              "  print(x)\n")
-        fn = support.findfile("tokenize_tests.txt")
+        fn = support.findfile("tokenize_tests.txt", subdir="tokenizedata")
         with open(fn, 'rb') as f:
             self.check_roundtrip(f)
         self.check_roundtrip("if x == 1:\n"
@@ -1585,6 +2089,67 @@ def test_roundtrip(self):
                              "    print('Can not import' # comment2\n)"
                              "else:   print('Loaded')\n")
 
+        self.check_roundtrip("f'\\N{EXCLAMATION MARK}'")
+        self.check_roundtrip(r"f'\\N{SNAKE}'")
+        self.check_roundtrip(r"f'\\N{{SNAKE}}'")
+        self.check_roundtrip(r"f'\N{SNAKE}'")
+        self.check_roundtrip(r"f'\\\N{SNAKE}'")
+        self.check_roundtrip(r"f'\\\\\N{SNAKE}'")
+        self.check_roundtrip(r"f'\\\\\\\N{SNAKE}'")
+
+        self.check_roundtrip(r"f'\\N{1}'")
+        self.check_roundtrip(r"f'\\\\N{2}'")
+        self.check_roundtrip(r"f'\\\\\\N{3}'")
+        self.check_roundtrip(r"f'\\\\\\\\N{4}'")
+
+        self.check_roundtrip(r"f'\\N{{'")
+        self.check_roundtrip(r"f'\\\\N{{'")
+        self.check_roundtrip(r"f'\\\\\\N{{'")
+        self.check_roundtrip(r"f'\\\\\\\\N{{'")
+
+        self.check_roundtrip(r"f'\n{{foo}}'")
+        self.check_roundtrip(r"f'\\n{{foo}}'")
+        self.check_roundtrip(r"f'\\\n{{foo}}'")
+        self.check_roundtrip(r"f'\\\\n{{foo}}'")
+
+        self.check_roundtrip(r"f'\t{{foo}}'")
+        self.check_roundtrip(r"f'\\t{{foo}}'")
+        self.check_roundtrip(r"f'\\\t{{foo}}'")
+        self.check_roundtrip(r"f'\\\\t{{foo}}'")
+
+        self.check_roundtrip(r"rf'\t{{foo}}'")
+        self.check_roundtrip(r"rf'\\t{{foo}}'")
+        self.check_roundtrip(r"rf'\\\t{{foo}}'")
+        self.check_roundtrip(r"rf'\\\\t{{foo}}'")
+
+        self.check_roundtrip(r"rf'\{{foo}}'")
+        self.check_roundtrip(r"f'\\{{foo}}'")
+        self.check_roundtrip(r"rf'\\\{{foo}}'")
+        self.check_roundtrip(r"f'\\\\{{foo}}'")
+        cases = [
+    """
+if 1:
+    "foo"
+"bar"
+""",
+    """
+if 1:
+    ("foo"
+     "bar")
+""",
+    """
+if 1:
+    "foo"
+    "bar"
+""" ]
+        for case in cases:
+            self.check_roundtrip(case)
+
+        self.check_roundtrip(r"t'{ {}}'")
+        self.check_roundtrip(r"t'{f'{ {}}'}{ {}}'")
+        self.check_roundtrip(r"f'{t'{ {}}'}{ {}}'")
+
+
     def test_continuation(self):
         # Balancing continuation
         self.check_roundtrip("a = (3,4, \n"
@@ -1611,26 +2176,15 @@ def test_string_concatenation(self):
         # Two string literals on the same line
         self.check_roundtrip("'' ''")
 
-    # TODO: RUSTPYTHON
-    @unittest.expectedFailure
+    @unittest.expectedFailure  # TODO: RUSTPYTHON
     def test_random_files(self):
         # Test roundtrip on random python modules.
         # pass the '-ucpu' option to process the full directory.
 
         import glob, random
-        fn = support.findfile("tokenize_tests.txt")
-        tempdir = os.path.dirname(fn) or os.curdir
+        tempdir = os.path.dirname(__file__) or os.curdir
         testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py"))
 
-        # Tokenize is broken on test_pep3131.py because regular expressions are
-        # broken on the obscure unicode identifiers in it. *sigh*
-        # With roundtrip extended to test the 5-tuple mode of untokenize,
-        # 7 more testfiles fail.  Remove them also until the failure is diagnosed.
-
-        testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py"))
-        for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
-            testfiles.remove(os.path.join(tempdir, "test_%s.py") % f)
-
         if not support.is_resource_enabled("cpu"):
             testfiles = random.sample(testfiles, 10)
 
@@ -1640,12 +2194,13 @@ def test_random_files(self):
             with open(testfile, 'rb') as f:
                 with self.subTest(file=testfile):
                     self.check_roundtrip(f)
+                    self.check_line_extraction(f)  # NOTE(review): check_roundtrip(f) already read f to EOF, so this sees b'' -- confirm intent
 
 
     def roundtrip(self, code):
         if isinstance(code, str):
             code = code.encode('utf-8')
-        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')
+        return tokenize.untokenize(tokenize.tokenize(BytesIO(code).readline)).decode('utf-8')
 
     def test_indentation_semantics_retained(self):
         """
@@ -1658,5 +2213,1288 @@ def test_indentation_semantics_retained(self):
         self.check_roundtrip(code)
 
 
+class InvalidPythonTests(TestCase):
+    @unittest.expectedFailure  # TODO: RUSTPYTHON; AssertionError: generate_tokens() output differs from expected_tokens
+    def test_number_followed_by_name(self):
+        # See gh-105549
+        source = "2sin(x)"
+        expected_tokens = [
+            tokenize.TokenInfo(type=token.NUMBER, string='2', start=(1, 0), end=(1, 1), line='2sin(x)'),
+            tokenize.TokenInfo(type=token.NAME, string='sin', start=(1, 1), end=(1, 4), line='2sin(x)'),
+            tokenize.TokenInfo(type=token.OP, string='(', start=(1, 4), end=(1, 5), line='2sin(x)'),
+            tokenize.TokenInfo(type=token.NAME, string='x', start=(1, 5), end=(1, 6), line='2sin(x)'),
+            tokenize.TokenInfo(type=token.OP, string=')', start=(1, 6), end=(1, 7), line='2sin(x)'),
+            tokenize.TokenInfo(type=token.NEWLINE, string='', start=(1, 7), end=(1, 8), line='2sin(x)'),
+            tokenize.TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+        ]
+
+        tokens = list(tokenize.generate_tokens(StringIO(source).readline))
+        self.assertEqual(tokens, expected_tokens)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON; AssertionError: generate_tokens() output differs from expected_tokens
+    def test_number_starting_with_zero(self):
+        source = "01234"
+        expected_tokens = [
+            tokenize.TokenInfo(type=token.NUMBER, string='01234', start=(1, 0), end=(1, 5), line='01234'),
+            tokenize.TokenInfo(type=token.NEWLINE, string='', start=(1, 5), end=(1, 6), line='01234'),
+            tokenize.TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+        ]
+
+        tokens = list(tokenize.generate_tokens(StringIO(source).readline))
+        self.assertEqual(tokens, expected_tokens)
+
+class CTokenizeTest(TestCase):
+    def check_tokenize(self, s, expected):
+        # Tokenize s with the C tokenizer and compare the tokens, rendered as a
+        # table, with expected.  The ENDMARKER and final NEWLINE are omitted.
+        f = StringIO(s)
+        with self.subTest(source=s):
+            result = stringify_tokens_from_source(
+                tokenize._generate_tokens_from_c_tokenizer(f.readline), s
+            )
+            self.assertEqual(result, expected.rstrip().splitlines())
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON
+    def test_encoding(self):
+        def readline(encoding):
+            yield "1+1".encode(encoding)
+
+        expected = [
+            tokenize.TokenInfo(type=tokenize.NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1'),
+            tokenize.TokenInfo(type=tokenize.OP, string='+', start=(1, 1), end=(1, 2), line='1+1'),
+            tokenize.TokenInfo(type=tokenize.NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1'),
+            tokenize.TokenInfo(type=tokenize.NEWLINE, string='', start=(1, 3), end=(1, 4), line='1+1'),
+            tokenize.TokenInfo(type=tokenize.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+        ]
+        for encoding in ["utf-8", "latin-1", "utf-16"]:
+            with self.subTest(encoding=encoding):
+                tokens = list(tokenize._generate_tokens_from_c_tokenizer(
+                    readline(encoding).__next__,
+                    extra_tokens=True,
+                    encoding=encoding,
+                ))
+                self.assertEqual(tokens, expected)
+
+    def test_int(self):
+
+        self.check_tokenize('0xff <= 255', """\
+    NUMBER     '0xff'        (1, 0) (1, 4)
+    LESSEQUAL  '<='          (1, 5) (1, 7)
+    NUMBER     '255'         (1, 8) (1, 11)
+    """)
+
+        self.check_tokenize('0b10 <= 255', """\
+    NUMBER     '0b10'        (1, 0) (1, 4)
+    LESSEQUAL  '<='          (1, 5) (1, 7)
+    NUMBER     '255'         (1, 8) (1, 11)
+    """)
+
+        self.check_tokenize('0o123 <= 0O123', """\
+    NUMBER     '0o123'       (1, 0) (1, 5)
+    LESSEQUAL  '<='          (1, 6) (1, 8)
+    NUMBER     '0O123'       (1, 9) (1, 14)
+    """)
+
+        self.check_tokenize('1234567 > ~0x15', """\
+    NUMBER     '1234567'     (1, 0) (1, 7)
+    GREATER    '>'           (1, 8) (1, 9)
+    TILDE      '~'           (1, 10) (1, 11)
+    NUMBER     '0x15'        (1, 11) (1, 15)
+    """)
+
+        self.check_tokenize('2134568 != 1231515', """\
+    NUMBER     '2134568'     (1, 0) (1, 7)
+    NOTEQUAL   '!='          (1, 8) (1, 10)
+    NUMBER     '1231515'     (1, 11) (1, 18)
+    """)
+
+        self.check_tokenize('(-124561-1) & 200000000', """\
+    LPAR       '('           (1, 0) (1, 1)
+    MINUS      '-'           (1, 1) (1, 2)
+    NUMBER     '124561'      (1, 2) (1, 8)
+    MINUS      '-'           (1, 8) (1, 9)
+    NUMBER     '1'           (1, 9) (1, 10)
+    RPAR       ')'           (1, 10) (1, 11)
+    AMPER      '&'           (1, 12) (1, 13)
+    NUMBER     '200000000'   (1, 14) (1, 23)
+    """)
+
+        self.check_tokenize('0xdeadbeef != -1', """\
+    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
+    NOTEQUAL   '!='          (1, 11) (1, 13)
+    MINUS      '-'           (1, 14) (1, 15)
+    NUMBER     '1'           (1, 15) (1, 16)
+    """)
+
+        self.check_tokenize('0xdeadc0de & 12345', """\
+    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
+    AMPER      '&'           (1, 11) (1, 12)
+    NUMBER     '12345'       (1, 13) (1, 18)
+    """)
+
+        self.check_tokenize('0xFF & 0x15 | 1234', """\
+    NUMBER     '0xFF'        (1, 0) (1, 4)
+    AMPER      '&'           (1, 5) (1, 6)
+    NUMBER     '0x15'        (1, 7) (1, 11)
+    VBAR       '|'           (1, 12) (1, 13)
+    NUMBER     '1234'        (1, 14) (1, 18)
+    """)
+
+    def test_float(self):
+
+        self.check_tokenize('x = 3.14159', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '3.14159'     (1, 4) (1, 11)
+    """)
+
+        self.check_tokenize('x = 314159.', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '314159.'     (1, 4) (1, 11)
+    """)
+
+        self.check_tokenize('x = .314159', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '.314159'     (1, 4) (1, 11)
+    """)
+
+        self.check_tokenize('x = 3e14159', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '3e14159'     (1, 4) (1, 11)
+    """)
+
+        self.check_tokenize('x = 3E123', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '3E123'       (1, 4) (1, 9)
+    """)
+
+        self.check_tokenize('x+y = 3e-1230', """\
+    NAME       'x'           (1, 0) (1, 1)
+    PLUS       '+'           (1, 1) (1, 2)
+    NAME       'y'           (1, 2) (1, 3)
+    EQUAL      '='           (1, 4) (1, 5)
+    NUMBER     '3e-1230'     (1, 6) (1, 13)
+    """)
+
+        self.check_tokenize('x = 3.14e159', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '3.14e159'    (1, 4) (1, 12)
+    """)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON
+    def test_string(self):
+
+        self.check_tokenize('x = \'\'; y = ""', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     "''"          (1, 4) (1, 6)
+    SEMI       ';'           (1, 6) (1, 7)
+    NAME       'y'           (1, 8) (1, 9)
+    EQUAL      '='           (1, 10) (1, 11)
+    STRING     '""'          (1, 12) (1, 14)
+    """)
+
+        self.check_tokenize('x = \'"\'; y = "\'"', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     '\\'"\\''       (1, 4) (1, 7)
+    SEMI       ';'           (1, 7) (1, 8)
+    NAME       'y'           (1, 9) (1, 10)
+    EQUAL      '='           (1, 11) (1, 12)
+    STRING     '"\\'"'        (1, 13) (1, 16)
+    """)
+
+        self.check_tokenize('x = "doesn\'t "shrink", does it"', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     '"doesn\\'t "' (1, 4) (1, 14)
+    NAME       'shrink'      (1, 14) (1, 20)
+    STRING     '", does it"' (1, 20) (1, 31)
+    """)
+
+        self.check_tokenize("x = 'abc' + 'ABC'", """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     "'abc'"       (1, 4) (1, 9)
+    PLUS       '+'           (1, 10) (1, 11)
+    STRING     "'ABC'"       (1, 12) (1, 17)
+    """)
+
+        self.check_tokenize('y = "ABC" + "ABC"', """\
+    NAME       'y'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     '"ABC"'       (1, 4) (1, 9)
+    PLUS       '+'           (1, 10) (1, 11)
+    STRING     '"ABC"'       (1, 12) (1, 17)
+    """)
+
+        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     "r'abc'"      (1, 4) (1, 10)
+    PLUS       '+'           (1, 11) (1, 12)
+    STRING     "r'ABC'"      (1, 13) (1, 19)
+    PLUS       '+'           (1, 20) (1, 21)
+    STRING     "R'ABC'"      (1, 22) (1, 28)
+    PLUS       '+'           (1, 29) (1, 30)
+    STRING     "R'ABC'"      (1, 31) (1, 37)
+    """)
+
+        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
+    NAME       'y'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    STRING     'r"abc"'      (1, 4) (1, 10)
+    PLUS       '+'           (1, 11) (1, 12)
+    STRING     'r"ABC"'      (1, 13) (1, 19)
+    PLUS       '+'           (1, 20) (1, 21)
+    STRING     'R"ABC"'      (1, 22) (1, 28)
+    PLUS       '+'           (1, 29) (1, 30)
+    STRING     'R"ABC"'      (1, 31) (1, 37)
+    """)
+
+        self.check_tokenize("u'abc' + U'abc'", """\
+    STRING     "u'abc'"      (1, 0) (1, 6)
+    PLUS       '+'           (1, 7) (1, 8)
+    STRING     "U'abc'"      (1, 9) (1, 15)
+    """)
+
+        self.check_tokenize('u"abc" + U"abc"', """\
+    STRING     'u"abc"'      (1, 0) (1, 6)
+    PLUS       '+'           (1, 7) (1, 8)
+    STRING     'U"abc"'      (1, 9) (1, 15)
+    """)
+
+        self.check_tokenize("b'abc' + B'abc'", """\
+    STRING     "b'abc'"      (1, 0) (1, 6)
+    PLUS       '+'           (1, 7) (1, 8)
+    STRING     "B'abc'"      (1, 9) (1, 15)
+    """)
+
+        self.check_tokenize('b"abc" + B"abc"', """\
+    STRING     'b"abc"'      (1, 0) (1, 6)
+    PLUS       '+'           (1, 7) (1, 8)
+    STRING     'B"abc"'      (1, 9) (1, 15)
+    """)
+
+        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
+    STRING     "br'abc'"     (1, 0) (1, 7)
+    PLUS       '+'           (1, 8) (1, 9)
+    STRING     "bR'abc'"     (1, 10) (1, 17)
+    PLUS       '+'           (1, 18) (1, 19)
+    STRING     "Br'abc'"     (1, 20) (1, 27)
+    PLUS       '+'           (1, 28) (1, 29)
+    STRING     "BR'abc'"     (1, 30) (1, 37)
+    """)
+
+        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
+    STRING     'br"abc"'     (1, 0) (1, 7)
+    PLUS       '+'           (1, 8) (1, 9)
+    STRING     'bR"abc"'     (1, 10) (1, 17)
+    PLUS       '+'           (1, 18) (1, 19)
+    STRING     'Br"abc"'     (1, 20) (1, 27)
+    PLUS       '+'           (1, 28) (1, 29)
+    STRING     'BR"abc"'     (1, 30) (1, 37)
+    """)
+
+        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
+    STRING     "rb'abc'"     (1, 0) (1, 7)
+    PLUS       '+'           (1, 8) (1, 9)
+    STRING     "rB'abc'"     (1, 10) (1, 17)
+    PLUS       '+'           (1, 18) (1, 19)
+    STRING     "Rb'abc'"     (1, 20) (1, 27)
+    PLUS       '+'           (1, 28) (1, 29)
+    STRING     "RB'abc'"     (1, 30) (1, 37)
+    """)
+
+        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
+    STRING     'rb"abc"'     (1, 0) (1, 7)
+    PLUS       '+'           (1, 8) (1, 9)
+    STRING     'rB"abc"'     (1, 10) (1, 17)
+    PLUS       '+'           (1, 18) (1, 19)
+    STRING     'Rb"abc"'     (1, 20) (1, 27)
+    PLUS       '+'           (1, 28) (1, 29)
+    STRING     'RB"abc"'     (1, 30) (1, 37)
+    """)
+
+        self.check_tokenize('"a\\\nde\\\nfg"', """\
+    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
+    """)
+
+        self.check_tokenize('u"a\\\nde"', """\
+    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
+    """)
+
+        self.check_tokenize('rb"a\\\nd"', """\
+    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
+    """)
+
+        self.check_tokenize(r'"""a\
+b"""', """\
+    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
+    """)
+        self.check_tokenize(r'u"""a\
+b"""', """\
+    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
+    """)
+        self.check_tokenize(r'rb"""a\
+b\
+c"""', """\
+    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
+    """)
+
+        self.check_tokenize(r'"hola\\\r\ndfgf"', """\
+    STRING     \'"hola\\\\\\\\\\\\r\\\\ndfgf"\' (1, 0) (1, 16)
+    """)
+
+        self.check_tokenize('f"abc"', """\
+    FSTRING_START 'f"'          (1, 0) (1, 2)
+    FSTRING_MIDDLE 'abc'         (1, 2) (1, 5)
+    FSTRING_END '"'           (1, 5) (1, 6)
+    """)
+
+        self.check_tokenize('fR"a{b}c"', """\
+    FSTRING_START 'fR"'         (1, 0) (1, 3)
+    FSTRING_MIDDLE 'a'           (1, 3) (1, 4)
+    LBRACE     '{'           (1, 4) (1, 5)
+    NAME       'b'           (1, 5) (1, 6)
+    RBRACE     '}'           (1, 6) (1, 7)
+    FSTRING_MIDDLE 'c'           (1, 7) (1, 8)
+    FSTRING_END '"'           (1, 8) (1, 9)
+    """)
+
+        self.check_tokenize('f"""abc"""', """\
+    FSTRING_START 'f\"""'        (1, 0) (1, 4)
+    FSTRING_MIDDLE 'abc'         (1, 4) (1, 7)
+    FSTRING_END '\"""'         (1, 7) (1, 10)
+    """)
+
+        self.check_tokenize(r'f"abc\
+def"', """\
+    FSTRING_START \'f"\'          (1, 0) (1, 2)
+    FSTRING_MIDDLE 'abc\\\\\\ndef'  (1, 2) (2, 3)
+    FSTRING_END '"'           (2, 3) (2, 4)
+    """)
+
+        self.check_tokenize('''\
+f"{
+a}"''', """\
+    FSTRING_START 'f"'          (1, 0) (1, 2)
+    LBRACE     '{'           (1, 2) (1, 3)
+    NAME       'a'           (2, 0) (2, 1)
+    RBRACE     '}'           (2, 1) (2, 2)
+    FSTRING_END '"'           (2, 2) (2, 3)
+    """)
+
+        self.check_tokenize(r'Rf"abc\
+def"', """\
+    FSTRING_START 'Rf"'         (1, 0) (1, 3)
+    FSTRING_MIDDLE 'abc\\\\\\ndef'  (1, 3) (2, 3)
+    FSTRING_END '"'           (2, 3) (2, 4)
+    """)
+
+        self.check_tokenize(r'f"hola\\\r\ndfgf"', """\
+    FSTRING_START \'f"\'          (1, 0) (1, 2)
+    FSTRING_MIDDLE 'hola\\\\\\\\\\\\r\\\\ndfgf' (1, 2) (1, 16)
+    FSTRING_END \'"\'           (1, 16) (1, 17)
+    """)
+
+        self.check_tokenize("""\
+f'''__{
+    x:a
+}__'''""", """\
+    FSTRING_START "f'''"        (1, 0) (1, 4)
+    FSTRING_MIDDLE '__'          (1, 4) (1, 6)
+    LBRACE     '{'           (1, 6) (1, 7)
+    NAME       'x'           (2, 4) (2, 5)
+    COLON      ':'           (2, 5) (2, 6)
+    FSTRING_MIDDLE 'a\\n'         (2, 6) (3, 0)
+    RBRACE     '}'           (3, 0) (3, 1)
+    FSTRING_MIDDLE '__'          (3, 1) (3, 3)
+    FSTRING_END "'''"         (3, 3) (3, 6)
+    """)
+
+        self.check_tokenize("""\
+f'''__{
+    x:a
+    b
+     c
+      d
+}__'''""", """\
+    FSTRING_START "f'''"        (1, 0) (1, 4)
+    FSTRING_MIDDLE '__'          (1, 4) (1, 6)
+    LBRACE     '{'           (1, 6) (1, 7)
+    NAME       'x'           (2, 4) (2, 5)
+    COLON      ':'           (2, 5) (2, 6)
+    FSTRING_MIDDLE 'a\\n    b\\n     c\\n      d\\n' (2, 6) (6, 0)
+    RBRACE     '}'           (6, 0) (6, 1)
+    FSTRING_MIDDLE '__'          (6, 1) (6, 3)
+    FSTRING_END "'''"         (6, 3) (6, 6)
+    """)
+
+    def test_function(self):
+
+        self.check_tokenize('def d22(a, b, c=2, d=2, *k): pass', """\
+    NAME       'def'         (1, 0) (1, 3)
+    NAME       'd22'         (1, 4) (1, 7)
+    LPAR       '('           (1, 7) (1, 8)
+    NAME       'a'           (1, 8) (1, 9)
+    COMMA      ','           (1, 9) (1, 10)
+    NAME       'b'           (1, 11) (1, 12)
+    COMMA      ','           (1, 12) (1, 13)
+    NAME       'c'           (1, 14) (1, 15)
+    EQUAL      '='           (1, 15) (1, 16)
+    NUMBER     '2'           (1, 16) (1, 17)
+    COMMA      ','           (1, 17) (1, 18)
+    NAME       'd'           (1, 19) (1, 20)
+    EQUAL      '='           (1, 20) (1, 21)
+    NUMBER     '2'           (1, 21) (1, 22)
+    COMMA      ','           (1, 22) (1, 23)
+    STAR       '*'           (1, 24) (1, 25)
+    NAME       'k'           (1, 25) (1, 26)
+    RPAR       ')'           (1, 26) (1, 27)
+    COLON      ':'           (1, 27) (1, 28)
+    NAME       'pass'        (1, 29) (1, 33)
+    """)
+
+        self.check_tokenize('def d01v_(a=1, *k, **w): pass', """\
+    NAME       'def'         (1, 0) (1, 3)
+    NAME       'd01v_'       (1, 4) (1, 9)
+    LPAR       '('           (1, 9) (1, 10)
+    NAME       'a'           (1, 10) (1, 11)
+    EQUAL      '='           (1, 11) (1, 12)
+    NUMBER     '1'           (1, 12) (1, 13)
+    COMMA      ','           (1, 13) (1, 14)
+    STAR       '*'           (1, 15) (1, 16)
+    NAME       'k'           (1, 16) (1, 17)
+    COMMA      ','           (1, 17) (1, 18)
+    DOUBLESTAR '**'          (1, 19) (1, 21)
+    NAME       'w'           (1, 21) (1, 22)
+    RPAR       ')'           (1, 22) (1, 23)
+    COLON      ':'           (1, 23) (1, 24)
+    NAME       'pass'        (1, 25) (1, 29)
+    """)
+
+        self.check_tokenize('def d23(a: str, b: int=3) -> int: pass', """\
+    NAME       'def'         (1, 0) (1, 3)
+    NAME       'd23'         (1, 4) (1, 7)
+    LPAR       '('           (1, 7) (1, 8)
+    NAME       'a'           (1, 8) (1, 9)
+    COLON      ':'           (1, 9) (1, 10)
+    NAME       'str'         (1, 11) (1, 14)
+    COMMA      ','           (1, 14) (1, 15)
+    NAME       'b'           (1, 16) (1, 17)
+    COLON      ':'           (1, 17) (1, 18)
+    NAME       'int'         (1, 19) (1, 22)
+    EQUAL      '='           (1, 22) (1, 23)
+    NUMBER     '3'           (1, 23) (1, 24)
+    RPAR       ')'           (1, 24) (1, 25)
+    RARROW     '->'          (1, 26) (1, 28)
+    NAME       'int'         (1, 29) (1, 32)
+    COLON      ':'           (1, 32) (1, 33)
+    NAME       'pass'        (1, 34) (1, 38)
+    """)
+
+    def test_comparison(self):
+
+        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
+                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
+    NAME       'if'          (1, 0) (1, 2)
+    NUMBER     '1'           (1, 3) (1, 4)
+    LESS       '<'           (1, 5) (1, 6)
+    NUMBER     '1'           (1, 7) (1, 8)
+    GREATER    '>'           (1, 9) (1, 10)
+    NUMBER     '1'           (1, 11) (1, 12)
+    EQEQUAL    '=='          (1, 13) (1, 15)
+    NUMBER     '1'           (1, 16) (1, 17)
+    GREATEREQUAL '>='          (1, 18) (1, 20)
+    NUMBER     '5'           (1, 21) (1, 22)
+    LESSEQUAL  '<='          (1, 23) (1, 25)
+    NUMBER     '0x15'        (1, 26) (1, 30)
+    LESSEQUAL  '<='          (1, 31) (1, 33)
+    NUMBER     '0x12'        (1, 34) (1, 38)
+    NOTEQUAL   '!='          (1, 39) (1, 41)
+    NUMBER     '1'           (1, 42) (1, 43)
+    NAME       'and'         (1, 44) (1, 47)
+    NUMBER     '5'           (1, 48) (1, 49)
+    NAME       'in'          (1, 50) (1, 52)
+    NUMBER     '1'           (1, 53) (1, 54)
+    NAME       'not'         (1, 55) (1, 58)
+    NAME       'in'          (1, 59) (1, 61)
+    NUMBER     '1'           (1, 62) (1, 63)
+    NAME       'is'          (1, 64) (1, 66)
+    NUMBER     '1'           (1, 67) (1, 68)
+    NAME       'or'          (1, 69) (1, 71)
+    NUMBER     '5'           (1, 72) (1, 73)
+    NAME       'is'          (1, 74) (1, 76)
+    NAME       'not'         (1, 77) (1, 80)
+    NUMBER     '1'           (1, 81) (1, 82)
+    COLON      ':'           (1, 82) (1, 83)
+    NAME       'pass'        (1, 84) (1, 88)
+    """)
+
+    def test_additive(self):
+
+        self.check_tokenize('x = 1 - y + 15 - 1 + 0x124 + z + a[5]', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '1'           (1, 4) (1, 5)
+    MINUS      '-'           (1, 6) (1, 7)
+    NAME       'y'           (1, 8) (1, 9)
+    PLUS       '+'           (1, 10) (1, 11)
+    NUMBER     '15'          (1, 12) (1, 14)
+    MINUS      '-'           (1, 15) (1, 16)
+    NUMBER     '1'           (1, 17) (1, 18)
+    PLUS       '+'           (1, 19) (1, 20)
+    NUMBER     '0x124'       (1, 21) (1, 26)
+    PLUS       '+'           (1, 27) (1, 28)
+    NAME       'z'           (1, 29) (1, 30)
+    PLUS       '+'           (1, 31) (1, 32)
+    NAME       'a'           (1, 33) (1, 34)
+    LSQB       '['           (1, 34) (1, 35)
+    NUMBER     '5'           (1, 35) (1, 36)
+    RSQB       ']'           (1, 36) (1, 37)
+    """)
+
+    def test_multiplicative(self):
+
+        self.check_tokenize('x = 1//1*1/5*12%0x12@42', """\
+    NAME       'x'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    NUMBER     '1'           (1, 4) (1, 5)
+    DOUBLESLASH '//'          (1, 5) (1, 7)
+    NUMBER     '1'           (1, 7) (1, 8)
+    STAR       '*'           (1, 8) (1, 9)
+    NUMBER     '1'           (1, 9) (1, 10)
+    SLASH      '/'           (1, 10) (1, 11)
+    NUMBER     '5'           (1, 11) (1, 12)
+    STAR       '*'           (1, 12) (1, 13)
+    NUMBER     '12'          (1, 13) (1, 15)
+    PERCENT    '%'           (1, 15) (1, 16)
+    NUMBER     '0x12'        (1, 16) (1, 20)
+    AT         '@'           (1, 20) (1, 21)
+    NUMBER     '42'          (1, 21) (1, 23)
+    """)
+
+    def test_unary(self):
+
+        self.check_tokenize('~1 ^ 1 & 1 |1 ^ -1', """\
+    TILDE      '~'           (1, 0) (1, 1)
+    NUMBER     '1'           (1, 1) (1, 2)
+    CIRCUMFLEX '^'           (1, 3) (1, 4)
+    NUMBER     '1'           (1, 5) (1, 6)
+    AMPER      '&'           (1, 7) (1, 8)
+    NUMBER     '1'           (1, 9) (1, 10)
+    VBAR       '|'           (1, 11) (1, 12)
+    NUMBER     '1'           (1, 12) (1, 13)
+    CIRCUMFLEX '^'           (1, 14) (1, 15)
+    MINUS      '-'           (1, 16) (1, 17)
+    NUMBER     '1'           (1, 17) (1, 18)
+    """)
+
+        self.check_tokenize('-1*1/1+1*1//1 - ---1**1', """\
+    MINUS      '-'           (1, 0) (1, 1)
+    NUMBER     '1'           (1, 1) (1, 2)
+    STAR       '*'           (1, 2) (1, 3)
+    NUMBER     '1'           (1, 3) (1, 4)
+    SLASH      '/'           (1, 4) (1, 5)
+    NUMBER     '1'           (1, 5) (1, 6)
+    PLUS       '+'           (1, 6) (1, 7)
+    NUMBER     '1'           (1, 7) (1, 8)
+    STAR       '*'           (1, 8) (1, 9)
+    NUMBER     '1'           (1, 9) (1, 10)
+    DOUBLESLASH '//'          (1, 10) (1, 12)
+    NUMBER     '1'           (1, 12) (1, 13)
+    MINUS      '-'           (1, 14) (1, 15)
+    MINUS      '-'           (1, 16) (1, 17)
+    MINUS      '-'           (1, 17) (1, 18)
+    MINUS      '-'           (1, 18) (1, 19)
+    NUMBER     '1'           (1, 19) (1, 20)
+    DOUBLESTAR '**'          (1, 20) (1, 22)
+    NUMBER     '1'           (1, 22) (1, 23)
+    """)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON
+    def test_selector(self):
+
+        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
+    NAME       'import'      (1, 0) (1, 6)
+    NAME       'sys'         (1, 7) (1, 10)
+    COMMA      ','           (1, 10) (1, 11)
+    NAME       'time'        (1, 12) (1, 16)
+    NEWLINE    ''            (1, 16) (1, 16)
+    NAME       'x'           (2, 0) (2, 1)
+    EQUAL      '='           (2, 2) (2, 3)
+    NAME       'sys'         (2, 4) (2, 7)
+    DOT        '.'           (2, 7) (2, 8)
+    NAME       'modules'     (2, 8) (2, 15)
+    LSQB       '['           (2, 15) (2, 16)
+    STRING     "'time'"      (2, 16) (2, 22)
+    RSQB       ']'           (2, 22) (2, 23)
+    DOT        '.'           (2, 23) (2, 24)
+    NAME       'time'        (2, 24) (2, 28)
+    LPAR       '('           (2, 28) (2, 29)
+    RPAR       ')'           (2, 29) (2, 30)
+    """)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON
+    def test_method(self):
+
+        self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
+    AT         '@'           (1, 0) (1, 1)
+    NAME       'staticmethod' (1, 1) (1, 13)
+    NEWLINE    ''            (1, 13) (1, 13)
+    NAME       'def'         (2, 0) (2, 3)
+    NAME       'foo'         (2, 4) (2, 7)
+    LPAR       '('           (2, 7) (2, 8)
+    NAME       'x'           (2, 8) (2, 9)
+    COMMA      ','           (2, 9) (2, 10)
+    NAME       'y'           (2, 10) (2, 11)
+    RPAR       ')'           (2, 11) (2, 12)
+    COLON      ':'           (2, 12) (2, 13)
+    NAME       'pass'        (2, 14) (2, 18)
+    """)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON
+    def test_tabs(self):
+
+        self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
+    AT         '@'           (1, 0) (1, 1)
+    NAME       'staticmethod' (1, 1) (1, 13)
+    NEWLINE    ''            (1, 13) (1, 13)
+    NAME       'def'         (2, 0) (2, 3)
+    NAME       'foo'         (2, 4) (2, 7)
+    LPAR       '('           (2, 7) (2, 8)
+    NAME       'x'           (2, 8) (2, 9)
+    COMMA      ','           (2, 9) (2, 10)
+    NAME       'y'           (2, 10) (2, 11)
+    RPAR       ')'           (2, 11) (2, 12)
+    COLON      ':'           (2, 12) (2, 13)
+    NAME       'pass'        (2, 14) (2, 18)
+    """)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON
+    def test_async(self):
+
+        self.check_tokenize('async = 1', """\
+    NAME       'async'       (1, 0) (1, 5)
+    EQUAL      '='           (1, 6) (1, 7)
+    NUMBER     '1'           (1, 8) (1, 9)
+    """)
+
+        self.check_tokenize('a = (async = 1)', """\
+    NAME       'a'           (1, 0) (1, 1)
+    EQUAL      '='           (1, 2) (1, 3)
+    LPAR       '('           (1, 4) (1, 5)
+    NAME       'async'       (1, 5) (1, 10)
+    EQUAL      '='           (1, 11) (1, 12)
+    NUMBER     '1'           (1, 13) (1, 14)
+    RPAR       ')'           (1, 14) (1, 15)
+    """)
+
+        self.check_tokenize('async()', """\
+    NAME       'async'       (1, 0) (1, 5)
+    LPAR       '('           (1, 5) (1, 6)
+    RPAR       ')'           (1, 6) (1, 7)
+    """)
+
+        self.check_tokenize('class async(Bar):pass', """\
+    NAME       'class'       (1, 0) (1, 5)
+    NAME       'async'       (1, 6) (1, 11)
+    LPAR       '('           (1, 11) (1, 12)
+    NAME       'Bar'         (1, 12) (1, 15)
+    RPAR       ')'           (1, 15) (1, 16)
+    COLON      ':'           (1, 16) (1, 17)
+    NAME       'pass'        (1, 17) (1, 21)
+    """)
+
+        self.check_tokenize('class async:pass', """\
+    NAME       'class'       (1, 0) (1, 5)
+    NAME       'async'       (1, 6) (1, 11)
+    COLON      ':'           (1, 11) (1, 12)
+    NAME       'pass'        (1, 12) (1, 16)
+    """)
+
+        self.check_tokenize('await = 1', """\
+    NAME       'await'       (1, 0) (1, 5)
+    EQUAL      '='           (1, 6) (1, 7)
+    NUMBER     '1'           (1, 8) (1, 9)
+    """)
+
+        self.check_tokenize('foo.async', """\
+    NAME       'foo'         (1, 0) (1, 3)
+    DOT        '.'           (1, 3) (1, 4)
+    NAME       'async'       (1, 4) (1, 9)
+    """)
+
+        self.check_tokenize('async for a in b: pass', """\
+    NAME       'async'       (1, 0) (1, 5)
+    NAME       'for'         (1, 6) (1, 9)
+    NAME       'a'           (1, 10) (1, 11)
+    NAME       'in'          (1, 12) (1, 14)
+    NAME       'b'           (1, 15) (1, 16)
+    COLON      ':'           (1, 16) (1, 17)
+    NAME       'pass'        (1, 18) (1, 22)
+    """)
+
+        self.check_tokenize('async with a as b: pass', """\
+    NAME       'async'       (1, 0) (1, 5)
+    NAME       'with'        (1, 6) (1, 10)
+    NAME       'a'           (1, 11) (1, 12)
+    NAME       'as'          (1, 13) (1, 15)
+    NAME       'b'           (1, 16) (1, 17)
+    COLON      ':'           (1, 17) (1, 18)
+    NAME       'pass'        (1, 19) (1, 23)
+    """)
+
+        self.check_tokenize('async.foo', """\
+    NAME       'async'       (1, 0) (1, 5)
+    DOT        '.'           (1, 5) (1, 6)
+    NAME       'foo'         (1, 6) (1, 9)
+    """)
+
+        self.check_tokenize('async', """\
+    NAME       'async'       (1, 0) (1, 5)
+    """)
+
+        self.check_tokenize('async\n#comment\nawait', """\
+    NAME       'async'       (1, 0) (1, 5)
+    NEWLINE    ''            (1, 5) (1, 5)
+    NAME       'await'       (3, 0) (3, 5)
+    """)
+
+        self.check_tokenize('async\n...\nawait', """\
+    NAME       'async'       (1, 0) (1, 5)
+    NEWLINE    ''            (1, 5) (1, 5)
+    ELLIPSIS   '...'         (2, 0) (2, 3)
+    NEWLINE    ''            (2, 3) (2, 3)
+    NAME       'await'       (3, 0) (3, 5)
+    """)
+
+        self.check_tokenize('async\nawait', """\
+    NAME       'async'       (1, 0) (1, 5)
+    NEWLINE    ''            (1, 5) (1, 5)
+    NAME       'await'       (2, 0) (2, 5)
+    """)
+
+        self.check_tokenize('foo.async + 1', """\
+    NAME       'foo'         (1, 0) (1, 3)
+    DOT        '.'           (1, 3) (1, 4)
+    NAME       'async'       (1, 4) (1, 9)
+    PLUS       '+'           (1, 10) (1, 11)
+    NUMBER     '1'           (1, 12) (1, 13)
+    """)
+
+        self.check_tokenize('async def foo(): pass', """\
+    NAME       'async'       (1, 0) (1, 5)
+    NAME       'def'         (1, 6) (1, 9)
+    NAME       'foo'         (1, 10) (1, 13)
+    LPAR       '('           (1, 13) (1, 14)
+    RPAR       ')'           (1, 14) (1, 15)
+    COLON      ':'           (1, 15) (1, 16)
+    NAME       'pass'        (1, 17) (1, 21)
+    """)
+
+        self.check_tokenize('''\
+async def foo():
+  def foo(await):
+    await = 1
+  if 1:
+    await
+async += 1
+''', """\
+    NAME       'async'       (1, 0) (1, 5)
+    NAME       'def'         (1, 6) (1, 9)
+    NAME       'foo'         (1, 10) (1, 13)
+    LPAR       '('           (1, 13) (1, 14)
+    RPAR       ')'           (1, 14) (1, 15)
+    COLON      ':'           (1, 15) (1, 16)
+    NEWLINE    ''            (1, 16) (1, 16)
+    INDENT     ''            (2, -1) (2, -1)
+    NAME       'def'         (2, 2) (2, 5)
+    NAME       'foo'         (2, 6) (2, 9)
+    LPAR       '('           (2, 9) (2, 10)
+    NAME       'await'       (2, 10) (2, 15)
+    RPAR       ')'           (2, 15) (2, 16)
+    COLON      ':'           (2, 16) (2, 17)
+    NEWLINE    ''            (2, 17) (2, 17)
+    INDENT     ''            (3, -1) (3, -1)
+    NAME       'await'       (3, 4) (3, 9)
+    EQUAL      '='           (3, 10) (3, 11)
+    NUMBER     '1'           (3, 12) (3, 13)
+    NEWLINE    ''            (3, 13) (3, 13)
+    DEDENT     ''            (4, -1) (4, -1)
+    NAME       'if'          (4, 2) (4, 4)
+    NUMBER     '1'           (4, 5) (4, 6)
+    COLON      ':'           (4, 6) (4, 7)
+    NEWLINE    ''            (4, 7) (4, 7)
+    INDENT     ''            (5, -1) (5, -1)
+    NAME       'await'       (5, 4) (5, 9)
+    NEWLINE    ''            (5, 9) (5, 9)
+    DEDENT     ''            (6, -1) (6, -1)
+    DEDENT     ''            (6, -1) (6, -1)
+    NAME       'async'       (6, 0) (6, 5)
+    PLUSEQUAL  '+='          (6, 6) (6, 8)
+    NUMBER     '1'           (6, 9) (6, 10)
+    NEWLINE    ''            (6, 10) (6, 10)
+    """)
+
+        self.check_tokenize('async def foo():\n  async for i in 1: pass', """\
+    NAME       'async'       (1, 0) (1, 5)
+    NAME       'def'         (1, 6) (1, 9)
+    NAME       'foo'         (1, 10) (1, 13)
+    LPAR       '('           (1, 13) (1, 14)
+    RPAR       ')'           (1, 14) (1, 15)
+    COLON      ':'           (1, 15) (1, 16)
+    NEWLINE    ''            (1, 16) (1, 16)
+    INDENT     ''            (2, -1) (2, -1)
+    NAME       'async'       (2, 2) (2, 7)
+    NAME       'for'         (2, 8) (2, 11)
+    NAME       'i'           (2, 12) (2, 13)
+    NAME       'in'          (2, 14) (2, 16)
+    NUMBER     '1'           (2, 17) (2, 18)
+    COLON      ':'           (2, 18) (2, 19)
+    NAME       'pass'        (2, 20) (2, 24)
+    DEDENT     ''            (2, -1) (2, -1)
+    """)
+
+        self.check_tokenize('async def foo(async): await', """\
+    NAME       'async'       (1, 0) (1, 5)
+    NAME       'def'         (1, 6) (1, 9)
+    NAME       'foo'         (1, 10) (1, 13)
+    LPAR       '('           (1, 13) (1, 14)
+    NAME       'async'       (1, 14) (1, 19)
+    RPAR       ')'           (1, 19) (1, 20)
+    COLON      ':'           (1, 20) (1, 21)
+    NAME       'await'       (1, 22) (1, 27)
+    """)
+
+        self.check_tokenize('''\
+def f():
+
+  def baz(): pass
+  async def bar(): pass
+
+  await = 2''', """\
+    NAME       'def'         (1, 0) (1, 3)
+    NAME       'f'           (1, 4) (1, 5)
+    LPAR       '('           (1, 5) (1, 6)
+    RPAR       ')'           (1, 6) (1, 7)
+    COLON      ':'           (1, 7) (1, 8)
+    NEWLINE    ''            (1, 8) (1, 8)
+    INDENT     ''            (3, -1) (3, -1)
+    NAME       'def'         (3, 2) (3, 5)
+    NAME       'baz'         (3, 6) (3, 9)
+    LPAR       '('           (3, 9) (3, 10)
+    RPAR       ')'           (3, 10) (3, 11)
+    COLON      ':'           (3, 11) (3, 12)
+    NAME       'pass'        (3, 13) (3, 17)
+    NEWLINE    ''            (3, 17) (3, 17)
+    NAME       'async'       (4, 2) (4, 7)
+    NAME       'def'         (4, 8) (4, 11)
+    NAME       'bar'         (4, 12) (4, 15)
+    LPAR       '('           (4, 15) (4, 16)
+    RPAR       ')'           (4, 16) (4, 17)
+    COLON      ':'           (4, 17) (4, 18)
+    NAME       'pass'        (4, 19) (4, 23)
+    NEWLINE    ''            (4, 23) (4, 23)
+    NAME       'await'       (6, 2) (6, 7)
+    EQUAL      '='           (6, 8) (6, 9)
+    NUMBER     '2'           (6, 10) (6, 11)
+    DEDENT     ''            (6, -1) (6, -1)
+    """)
+
+        self.check_tokenize('''\
+async def f():
+
+  def baz(): pass
+  async def bar(): pass
+
+  await = 2''', """\
+    NAME       'async'       (1, 0) (1, 5)
+    NAME       'def'         (1, 6) (1, 9)
+    NAME       'f'           (1, 10) (1, 11)
+    LPAR       '('           (1, 11) (1, 12)
+    RPAR       ')'           (1, 12) (1, 13)
+    COLON      ':'           (1, 13) (1, 14)
+    NEWLINE    ''            (1, 14) (1, 14)
+    INDENT     ''            (3, -1) (3, -1)
+    NAME       'def'         (3, 2) (3, 5)
+    NAME       'baz'         (3, 6) (3, 9)
+    LPAR       '('           (3, 9) (3, 10)
+    RPAR       ')'           (3, 10) (3, 11)
+    COLON      ':'           (3, 11) (3, 12)
+    NAME       'pass'        (3, 13) (3, 17)
+    NEWLINE    ''            (3, 17) (3, 17)
+    NAME       'async'       (4, 2) (4, 7)
+    NAME       'def'         (4, 8) (4, 11)
+    NAME       'bar'         (4, 12) (4, 15)
+    LPAR       '('           (4, 15) (4, 16)
+    RPAR       ')'           (4, 16) (4, 17)
+    COLON      ':'           (4, 17) (4, 18)
+    NAME       'pass'        (4, 19) (4, 23)
+    NEWLINE    ''            (4, 23) (4, 23)
+    NAME       'await'       (6, 2) (6, 7)
+    EQUAL      '='           (6, 8) (6, 9)
+    NUMBER     '2'           (6, 10) (6, 11)
+    DEDENT     ''            (6, -1) (6, -1)
+    """)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON
+    def test_unicode(self):
+
+        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
+    NAME       'Örter'       (1, 0) (1, 5)
+    EQUAL      '='           (1, 6) (1, 7)
+    STRING     "u'places'"   (1, 8) (1, 17)
+    NEWLINE    ''            (1, 17) (1, 17)
+    NAME       'grün'        (2, 0) (2, 4)
+    EQUAL      '='           (2, 5) (2, 6)
+    STRING     "U'green'"    (2, 7) (2, 15)
+    """)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON
+    def test_invalid_syntax(self):
+        def get_tokens(string):
+            the_string = StringIO(string)
+            return list(tokenize._generate_tokens_from_c_tokenizer(the_string.readline))
+
+        for case in [
+            "(1+2]",
+            "(1+2}",
+            "{1+2]",
+            "1_",
+            "1.2_",
+            "1e2_",
+            "1e+",
+
+            "\xa0",
+            "€",
+            "0b12",
+            "0b1_2",
+            "0b2",
+            "0b1_",
+            "0b",
+            "0o18",
+            "0o1_8",
+            "0o8",
+            "0o1_",
+            "0o",
+            "0x1_",
+            "0x",
+            "1_",
+            "012",
+            "1.2_",
+            "1e2_",
+            "1e+",
+            "'sdfsdf",
+            "'''sdfsdf''",
+            "("*1000+"a"+")"*1000,
+            "]",
+            """\
+            f'__{
+                x:d
+            }__'""",
+            " a\n\x00",
+        ]:
+            with self.subTest(case=case):
+                self.assertRaises(tokenize.TokenError, get_tokens, case)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON; AssertionError: IndentationError not raised by <lambda>
+    @support.skip_wasi_stack_overflow()
+    def test_max_indent(self):
+        MAXINDENT = 100
+
+        def generate_source(indents):
+            source = ''.join(('  ' * x) + 'if True:\n' for x in range(indents))
+            source += '  ' * indents + 'pass\n'
+            return source
+
+        valid = generate_source(MAXINDENT - 1)
+        the_input = StringIO(valid)
+        tokens = list(tokenize._generate_tokens_from_c_tokenizer(the_input.readline))
+        self.assertEqual(tokens[-2].type, tokenize.DEDENT)
+        self.assertEqual(tokens[-1].type, tokenize.ENDMARKER)
+        compile(valid, "<string>", "exec")
+
+        invalid = generate_source(MAXINDENT)
+        the_input = StringIO(invalid)
+        self.assertRaises(IndentationError, lambda: list(tokenize._generate_tokens_from_c_tokenizer(the_input.readline)))
+        self.assertRaises(
+            IndentationError, compile, invalid, "<string>", "exec"
+        )
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON; (0, '')]
+    def test_continuation_lines_indentation(self):
+        def get_tokens(string):
+            the_string = StringIO(string)
+            return [(kind, string) for (kind, string, *_)
+                    in tokenize._generate_tokens_from_c_tokenizer(the_string.readline)]
+
+        code = dedent("""
+            def fib(n):
+                \\
+            '''Print a Fibonacci series up to n.'''
+                \\
+            a, b = 0, 1
+        """)
+
+        self.check_tokenize(code, """\
+    NAME       'def'         (2, 0) (2, 3)
+    NAME       'fib'         (2, 4) (2, 7)
+    LPAR       '('           (2, 7) (2, 8)
+    NAME       'n'           (2, 8) (2, 9)
+    RPAR       ')'           (2, 9) (2, 10)
+    COLON      ':'           (2, 10) (2, 11)
+    NEWLINE    ''            (2, 11) (2, 11)
+    INDENT     ''            (4, -1) (4, -1)
+    STRING     "'''Print a Fibonacci series up to n.'''" (4, 0) (4, 39)
+    NEWLINE    ''            (4, 39) (4, 39)
+    NAME       'a'           (6, 0) (6, 1)
+    COMMA      ','           (6, 1) (6, 2)
+    NAME       'b'           (6, 3) (6, 4)
+    EQUAL      '='           (6, 5) (6, 6)
+    NUMBER     '0'           (6, 7) (6, 8)
+    COMMA      ','           (6, 8) (6, 9)
+    NUMBER     '1'           (6, 10) (6, 11)
+    NEWLINE    ''            (6, 11) (6, 11)
+    DEDENT     ''            (6, -1) (6, -1)
+        """)
+
+        code_no_cont = dedent("""
+            def fib(n):
+                '''Print a Fibonacci series up to n.'''
+                a, b = 0, 1
+        """)
+
+        self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
+
+        code = dedent("""
+            pass
+                \\
+
+            pass
+        """)
+
+        self.check_tokenize(code, """\
+    NAME       'pass'        (2, 0) (2, 4)
+    NEWLINE    ''            (2, 4) (2, 4)
+    NAME       'pass'        (5, 0) (5, 4)
+    NEWLINE    ''            (5, 4) (5, 4)
+        """)
+
+        code_no_cont = dedent("""
+            pass
+            pass
+        """)
+
+        self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
+
+        code = dedent("""
+            if x:
+                y = 1
+                \\
+                        \\
+                    \\
+                \\
+                foo = 1
+        """)
+
+        self.check_tokenize(code, """\
+    NAME       'if'          (2, 0) (2, 2)
+    NAME       'x'           (2, 3) (2, 4)
+    COLON      ':'           (2, 4) (2, 5)
+    NEWLINE    ''            (2, 5) (2, 5)
+    INDENT     ''            (3, -1) (3, -1)
+    NAME       'y'           (3, 4) (3, 5)
+    EQUAL      '='           (3, 6) (3, 7)
+    NUMBER     '1'           (3, 8) (3, 9)
+    NEWLINE    ''            (3, 9) (3, 9)
+    NAME       'foo'         (8, 4) (8, 7)
+    EQUAL      '='           (8, 8) (8, 9)
+    NUMBER     '1'           (8, 10) (8, 11)
+    NEWLINE    ''            (8, 11) (8, 11)
+    DEDENT     ''            (8, -1) (8, -1)
+        """)
+
+        code_no_cont = dedent("""
+            if x:
+                y = 1
+                foo = 1
+        """)
+
+        self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
+
+
+class CTokenizerBufferTests(unittest.TestCase):
+    def test_newline_at_the_end_of_buffer(self):
+        # See issue 99581: Make sure that if we need to add a new line at the
+        # end of the buffer, we have enough space in the buffer, especially when
+        # the current line is as long as the buffer space available.
+        test_script = f"""\
+        #coding: latin-1
+        #{"a"*10000}
+        #{"a"*10002}"""
+        with os_helper.temp_dir() as temp_dir:
+            file_name = make_script(temp_dir, 'foo', test_script)
+            run_test_script(file_name)
+
+
+class CommandLineTest(unittest.TestCase):
+    def setUp(self):
+        self.filename = tempfile.mktemp()
+        self.addCleanup(os_helper.unlink, self.filename)
+
+    @staticmethod
+    def text_normalize(string):
+        """Dedent *string* and strip it from its surrounding whitespaces.
+
+        This method is used by the other utility functions so that any
+        string to write or to match against can be freely indented.
+        """
+        return re.sub(r'\s+', ' ', string).strip()
+
+    def set_source(self, content):
+        with open(self.filename, 'w') as fp:
+            fp.write(content)
+
+    def invoke_tokenize(self, *flags):
+        output = StringIO()
+        with contextlib.redirect_stdout(output):
+            tokenize._main(args=[*flags, self.filename])
+        return self.text_normalize(output.getvalue())
+
+    def check_output(self, source, expect, *flags):
+        with self.subTest(source=source, flags=flags):
+            self.set_source(source)
+            res = self.invoke_tokenize(*flags)
+            expect = self.text_normalize(expect)
+            self.assertListEqual(res.splitlines(), expect.splitlines())
+
+    def test_invocation(self):
+        # test various combinations of parameters
+        base_flags = ('-e', '--exact')
+
+        self.set_source('''
+            def f():
+                print(x)
+                return None
+        ''')
+
+        for flag in base_flags:
+            with self.subTest(args=flag):
+                _ = self.invoke_tokenize(flag)
+
+        with self.assertRaises(SystemExit):
+            # suppress argparse error message
+            with contextlib.redirect_stderr(StringIO()):
+                _ = self.invoke_tokenize('--unknown')
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON
+    def test_without_flag(self):
+        # test 'python -m tokenize source.py'
+        source = 'a = 1'
+        expect = '''
+            0,0-0,0:            ENCODING       'utf-8'
+            1,0-1,1:            NAME           'a'
+            1,2-1,3:            OP             '='
+            1,4-1,5:            NUMBER         '1'
+            1,5-1,6:            NEWLINE        ''
+            2,0-2,0:            ENDMARKER      ''
+        '''
+        self.check_output(source, expect)
+
+    @unittest.expectedFailure  # TODO: RUSTPYTHON
+    def test_exact_flag(self):
+        # test 'python -m tokenize -e/--exact source.py'
+        source = 'a = 1'
+        expect = '''
+            0,0-0,0:            ENCODING       'utf-8'
+            1,0-1,1:            NAME           'a'
+            1,2-1,3:            EQUAL          '='
+            1,4-1,5:            NUMBER         '1'
+            1,5-1,6:            NEWLINE        ''
+            2,0-2,0:            ENDMARKER      ''
+        '''
+        for flag in ['-e', '--exact']:
+            self.check_output(source, expect, flag)
+
+
+class StringPrefixTest(unittest.TestCase):
+    @staticmethod
+    def determine_valid_prefixes():
+        # Try all lengths until we find a length that has zero valid
+        # prefixes.  This will miss the case where for example there
+        # are no valid 3 character prefixes, but there are valid 4
+        # character prefixes.  That seems unlikely.
+
+        single_char_valid_prefixes = set()
+
+        # Find all of the single character string prefixes. Just get
+        # the lowercase version, we'll deal with combinations of upper
+        # and lower case later.  I'm using this logic just in case
+        # some uppercase-only prefix is added.
+        for letter in itertools.chain(string.ascii_lowercase, string.ascii_uppercase):
+            try:
+                eval(f'{letter}""')
+                single_char_valid_prefixes.add(letter.lower())
+            except SyntaxError:
+                pass
+
+        # This logic assumes that all combinations of valid prefixes only use
+        # the characters that are valid single character prefixes.  That seems
+        # like a valid assumption, but if it ever changes this will need
+        # adjusting.
+        valid_prefixes = set()
+        for length in itertools.count():
+            num_at_this_length = 0
+            for prefix in (
+                "".join(l)
+                for l in itertools.combinations(single_char_valid_prefixes, length)
+            ):
+                for t in itertools.permutations(prefix):
+                    for u in itertools.product(*[(c, c.upper()) for c in t]):
+                        p = "".join(u)
+                        if p == "not":
+                            # 'not' can never be a string prefix,
+                            # because it's a valid expression: not ""
+                            continue
+                        try:
+                            eval(f'{p}""')
+
+                            # No syntax error, so p is a valid string
+                            # prefix.
+
+                            valid_prefixes.add(p)
+                            num_at_this_length += 1
+                        except SyntaxError:
+                            pass
+            if num_at_this_length == 0:
+                return valid_prefixes
+
+
+    def test_prefixes(self):
+        # Get the list of defined string prefixes.  I don't see an
+        # obvious documented way of doing this, but probably the best
+        # thing is to split apart tokenize.StringPrefix.
+
+        # Make sure StringPrefix begins and ends in parens.  We're
+        # assuming it's of the form "(a|b|ab)", if a, b, and ab are
+        # valid string prefixes.
+        self.assertEqual(tokenize.StringPrefix[0], '(')
+        self.assertEqual(tokenize.StringPrefix[-1], ')')
+
+        # Then split apart everything else by '|'.
+        defined_prefixes = set(tokenize.StringPrefix[1:-1].split('|'))
+
+        # Now compute the actual allowed string prefixes and compare
+        # to what is defined in the tokenize module.
+        self.assertEqual(defined_prefixes, self.determine_valid_prefixes())
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index d72968e4250..1f31258ce36 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -24,10 +24,7 @@
 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
                'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
                'Michael Foord')
-try:
-    from builtins import open as _builtin_open
-except ImportError:
-    pass
+from builtins import open as _builtin_open
 from codecs import lookup, BOM_UTF8
 import collections
 import functools
@@ -37,13 +34,14 @@
 import sys
 from token import *
 from token import EXACT_TOKEN_TYPES
+import _tokenize
 
-cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
+cookie_re = re.compile(br'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
 __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
-                           "untokenize", "TokenInfo"]
+                           "untokenize", "TokenInfo", "open", "TokenError"]
 del token
 
 class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
@@ -88,7 +86,7 @@ def _all_string_prefixes():
     # The valid string prefixes. Only contain the lower case versions,
     #  and don't contain any permutations (include 'fr', but not
     #  'rf'). The various permutations will be generated.
-    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
+    _valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'br', 'fr', 'tr']
     # if we add binary f-strings, add: ['fb', 'fbr']
     result = {''}
     for prefix in _valid_string_prefixes:
@@ -134,7 +132,7 @@ def _compile(expr):
                 group("'", r'\\\r?\n'),
                 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                 group('"', r'\\\r?\n'))
-PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
+PseudoExtras = group(r'\\\r?\n|\z', Comment, Triple)
 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 
 # For a given string prefix plus quotes, endpats maps it to a regex
@@ -146,6 +144,7 @@ def _compile(expr):
     endpats[_prefix + '"'] = Double
     endpats[_prefix + "'''"] = Single3
     endpats[_prefix + '"""'] = Double3
+del _prefix
 
 # A set of all of the single and triple quoted string prefixes,
 #  including the opening quotes.
@@ -156,13 +155,12 @@ def _compile(expr):
         single_quoted.add(u)
     for u in (t + '"""', t + "'''"):
         triple_quoted.add(u)
+del t, u
 
 tabsize = 8
 
 class TokenError(Exception): pass
 
-class StopTokenizing(Exception): pass
-
 
 class Untokenizer:
 
@@ -170,6 +168,8 @@ def __init__(self):
         self.tokens = []
         self.prev_row = 1
         self.prev_col = 0
+        self.prev_type = None
+        self.prev_line = ""
         self.encoding = None
 
     def add_whitespace(self, start):
@@ -177,14 +177,51 @@ def add_whitespace(self, start):
         if row < self.prev_row or row == self.prev_row and col < self.prev_col:
             raise ValueError("start ({},{}) precedes previous end ({},{})"
                              .format(row, col, self.prev_row, self.prev_col))
-        row_offset = row - self.prev_row
-        if row_offset:
-            self.tokens.append("\\\n" * row_offset)
-            self.prev_col = 0
+        self.add_backslash_continuation(start)
         col_offset = col - self.prev_col
         if col_offset:
             self.tokens.append(" " * col_offset)
 
+    def add_backslash_continuation(self, start):
+        """Add backslash continuation characters if the row has increased
+        without encountering a newline token.
+
+        This also inserts the correct amount of whitespace before the backslash.
+        """
+        row = start[0]
+        row_offset = row - self.prev_row
+        if row_offset == 0:
+            return
+
+        newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n'
+        line = self.prev_line.rstrip('\\\r\n')
+        ws = ''.join(_itertools.takewhile(str.isspace, reversed(line)))
+        self.tokens.append(ws + f"\\{newline}" * row_offset)
+        self.prev_col = 0
+
+    def escape_brackets(self, token):
+        characters = []
+        consume_until_next_bracket = False
+        for character in token:
+            if character == "}":
+                if consume_until_next_bracket:
+                    consume_until_next_bracket = False
+                else:
+                    characters.append(character)
+            if character == "{":
+                n_backslashes = sum(
+                    1 for char in _itertools.takewhile(
+                        "\\".__eq__,
+                        characters[-2::-1]
+                    )
+                )
+                if n_backslashes % 2 == 0 or characters[-1] != "N":
+                    characters.append(character)
+                else:
+                    consume_until_next_bracket = True
+            characters.append(character)
+        return "".join(characters)
+
     def untokenize(self, iterable):
         it = iter(iterable)
         indents = []
@@ -214,12 +251,22 @@ def untokenize(self, iterable):
                     self.tokens.append(indent)
                     self.prev_col = len(indent)
                 startline = False
+            elif tok_type in {FSTRING_MIDDLE, TSTRING_MIDDLE}:
+                if '{' in token or '}' in token:
+                    token = self.escape_brackets(token)
+                    last_line = token.splitlines()[-1]
+                    end_line, end_col = end
+                    extra_chars = last_line.count("{{") + last_line.count("}}")
+                    end = (end_line, end_col + extra_chars)
+
             self.add_whitespace(start)
             self.tokens.append(token)
             self.prev_row, self.prev_col = end
             if tok_type in (NEWLINE, NL):
                 self.prev_row += 1
                 self.prev_col = 0
+            self.prev_type = tok_type
+            self.prev_line = line
         return "".join(self.tokens)
 
     def compat(self, token, iterable):
@@ -227,6 +274,7 @@ def compat(self, token, iterable):
         toks_append = self.tokens.append
         startline = token[0] in (NEWLINE, NL)
         prevstring = False
+        in_fstring_or_tstring = 0
 
         for tok in _itertools.chain([token], iterable):
             toknum, tokval = tok[:2]
@@ -245,6 +293,10 @@ def compat(self, token, iterable):
             else:
                 prevstring = False
 
+            if toknum in {FSTRING_START, TSTRING_START}:
+                in_fstring_or_tstring += 1
+            elif toknum in {FSTRING_END, TSTRING_END}:
+                in_fstring_or_tstring -= 1
             if toknum == INDENT:
                 indents.append(tokval)
                 continue
@@ -256,7 +308,19 @@ def compat(self, token, iterable):
             elif startline and indents:
                 toks_append(indents[-1])
                 startline = False
+            elif toknum in {FSTRING_MIDDLE, TSTRING_MIDDLE}:
+                tokval = self.escape_brackets(tokval)
+
+            # Insert a space between two consecutive brackets if we are in an f-string or t-string
+            if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring_or_tstring:
+                tokval = ' ' + tokval
+
+            # Insert a space between two consecutive f-strings
+            if toknum in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
+                self.tokens.append(" ")
+
             toks_append(tokval)
+            self.prev_type = toknum
 
 
 def untokenize(iterable):
@@ -268,16 +332,10 @@ def untokenize(iterable):
     with at least two elements, a token number and token value.  If
     only two tokens are passed, the resulting output is poor.
 
-    Round-trip invariant for full input:
-        Untokenized source will match input source exactly
-
-    Round-trip invariant for limited input:
-        # Output bytes will tokenize back to the input
-        t1 = [tok[:2] for tok in tokenize(f.readline)]
-        newcode = untokenize(t1)
-        readline = BytesIO(newcode).readline
-        t2 = [tok[:2] for tok in tokenize(readline)]
-        assert t1 == t2
+    The result is guaranteed to tokenize back to match the input so
+    that the conversion is lossless and round-trips are assured.
+    The guarantee applies only to the token type and token string as
+    the spacing between tokens (column positions) may change.
     """
     ut = Untokenizer()
     out = ut.untokenize(iterable)
@@ -287,7 +345,7 @@ def untokenize(iterable):
 
 
 def _get_normal_name(orig_enc):
-    """Imitates get_normal_name in tokenizer.c."""
+    """Imitates get_normal_name in Parser/tokenizer/helpers.c."""
     # Only care about the first 12 characters.
     enc = orig_enc[:12].lower().replace("_", "-")
     if enc == "utf-8" or enc.startswith("utf-8-"):
@@ -327,22 +385,23 @@ def read_or_stop():
         except StopIteration:
             return b''
 
-    def find_cookie(line):
+    def check(line, encoding):
+        # Check if the line matches the encoding.
+        if 0 in line:
+            raise SyntaxError("source code cannot contain null bytes")
         try:
-            # Decode as UTF-8. Either the line is an encoding declaration,
-            # in which case it should be pure ASCII, or it must be UTF-8
-            # per default encoding.
-            line_string = line.decode('utf-8')
+            line.decode(encoding)
         except UnicodeDecodeError:
             msg = "invalid or missing encoding declaration"
             if filename is not None:
                 msg = '{} for {!r}'.format(msg, filename)
             raise SyntaxError(msg)
 
-        match = cookie_re.match(line_string)
+    def find_cookie(line):
+        match = cookie_re.match(line)
         if not match:
             return None
-        encoding = _get_normal_name(match.group(1))
+        encoding = _get_normal_name(match.group(1).decode())
         try:
             codec = lookup(encoding)
         except LookupError:
@@ -375,18 +434,23 @@ def find_cookie(line):
 
     encoding = find_cookie(first)
     if encoding:
+        check(first, encoding)
         return encoding, [first]
     if not blank_re.match(first):
+        check(first, default)
         return default, [first]
 
     second = read_or_stop()
     if not second:
+        check(first, default)
         return default, [first]
 
     encoding = find_cookie(second)
     if encoding:
+        check(first + second, encoding)
         return encoding, [first, second]
 
+    check(first + second, default)
     return default, [first, second]
 
 
@@ -405,7 +469,6 @@ def open(filename):
         buffer.close()
         raise
 
-
 def tokenize(readline):
     """
     The tokenize() generator requires one argument, readline, which
@@ -426,193 +489,13 @@ def tokenize(readline):
     which tells you which encoding was used to decode the bytes stream.
     """
     encoding, consumed = detect_encoding(readline)
-    empty = _itertools.repeat(b"")
-    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
-    return _tokenize(rl_gen.__next__, encoding)
-
-
-def _tokenize(readline, encoding):
-    lnum = parenlev = continued = 0
-    numchars = '0123456789'
-    contstr, needcont = '', 0
-    contline = None
-    indents = [0]
-
+    rl_gen = _itertools.chain(consumed, iter(readline, b""))
     if encoding is not None:
         if encoding == "utf-8-sig":
             # BOM will already have been stripped.
             encoding = "utf-8"
         yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    last_line = b''
-    line = b''
-    while True:                                # loop over lines in stream
-        try:
-            # We capture the value of the line variable here because
-            # readline uses the empty string '' to signal end of input,
-            # hence `line` itself will always be overwritten at the end
-            # of this loop.
-            last_line = line
-            line = readline()
-        except StopIteration:
-            line = b''
-
-        if encoding is not None:
-            line = line.decode(encoding)
-        lnum += 1
-        pos, max = 0, len(line)
-
-        if contstr:                            # continued string
-            if not line:
-                raise TokenError("EOF in multi-line string", strstart)
-            endmatch = endprog.match(line)
-            if endmatch:
-                pos = end = endmatch.end(0)
-                yield TokenInfo(STRING, contstr + line[:end],
-                       strstart, (lnum, end), contline + line)
-                contstr, needcont = '', 0
-                contline = None
-            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
-                yield TokenInfo(ERRORTOKEN, contstr + line,
-                           strstart, (lnum, len(line)), contline)
-                contstr = ''
-                contline = None
-                continue
-            else:
-                contstr = contstr + line
-                contline = contline + line
-                continue
-
-        elif parenlev == 0 and not continued:  # new statement
-            if not line: break
-            column = 0
-            while pos < max:                   # measure leading whitespace
-                if line[pos] == ' ':
-                    column += 1
-                elif line[pos] == '\t':
-                    column = (column//tabsize + 1)*tabsize
-                elif line[pos] == '\f':
-                    column = 0
-                else:
-                    break
-                pos += 1
-            if pos == max:
-                break
-
-            if line[pos] in '#\r\n':           # skip comments or blank lines
-                if line[pos] == '#':
-                    comment_token = line[pos:].rstrip('\r\n')
-                    yield TokenInfo(COMMENT, comment_token,
-                           (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    pos += len(comment_token)
-
-                yield TokenInfo(NL, line[pos:],
-                           (lnum, pos), (lnum, len(line)), line)
-                continue
-
-            if column > indents[-1]:           # count indents or dedents
-                indents.append(column)
-                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-            while column < indents[-1]:
-                if column not in indents:
-                    raise IndentationError(
-                        "unindent does not match any outer indentation level",
-                        ("<tokenize>", lnum, pos, line))
-                indents = indents[:-1]
-
-                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
-
-        else:                                  # continued statement
-            if not line:
-                raise TokenError("EOF in multi-line statement", (lnum, 0))
-            continued = 0
-
-        while pos < max:
-            pseudomatch = _compile(PseudoToken).match(line, pos)
-            if pseudomatch:                                # scan for tokens
-                start, end = pseudomatch.span(1)
-                spos, epos, pos = (lnum, start), (lnum, end), end
-                if start == end:
-                    continue
-                token, initial = line[start:end], line[start]
-
-                if (initial in numchars or                 # ordinary number
-                    (initial == '.' and token != '.' and token != '...')):
-                    yield TokenInfo(NUMBER, token, spos, epos, line)
-                elif initial in '\r\n':
-                    if parenlev > 0:
-                        yield TokenInfo(NL, token, spos, epos, line)
-                    else:
-                        yield TokenInfo(NEWLINE, token, spos, epos, line)
-
-                elif initial == '#':
-                    assert not token.endswith("\n")
-                    yield TokenInfo(COMMENT, token, spos, epos, line)
-
-                elif token in triple_quoted:
-                    endprog = _compile(endpats[token])
-                    endmatch = endprog.match(line, pos)
-                    if endmatch:                           # all on one line
-                        pos = endmatch.end(0)
-                        token = line[start:pos]
-                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
-                    else:
-                        strstart = (lnum, start)           # multiple lines
-                        contstr = line[start:]
-                        contline = line
-                        break
-
-                # Check up to the first 3 chars of the token to see if
-                #  they're in the single_quoted set. If so, they start
-                #  a string.
-                # We're using the first 3, because we're looking for
-                #  "rb'" (for example) at the start of the token. If
-                #  we switch to longer prefixes, this needs to be
-                #  adjusted.
-                # Note that initial == token[:1].
-                # Also note that single quote checking must come after
-                #  triple quote checking (above).
-                elif (initial in single_quoted or
-                      token[:2] in single_quoted or
-                      token[:3] in single_quoted):
-                    if token[-1] == '\n':                  # continued string
-                        strstart = (lnum, start)
-                        # Again, using the first 3 chars of the
-                        #  token. This is looking for the matching end
-                        #  regex for the correct type of quote
-                        #  character. So it's really looking for
-                        #  endpats["'"] or endpats['"'], by trying to
-                        #  skip string prefix characters, if any.
-                        endprog = _compile(endpats.get(initial) or
-                                           endpats.get(token[1]) or
-                                           endpats.get(token[2]))
-                        contstr, needcont = line[start:], 1
-                        contline = line
-                        break
-                    else:                                  # ordinary string
-                        yield TokenInfo(STRING, token, spos, epos, line)
-
-                elif initial.isidentifier():               # ordinary name
-                    yield TokenInfo(NAME, token, spos, epos, line)
-                elif initial == '\\':                      # continued stmt
-                    continued = 1
-                else:
-                    if initial in '([{':
-                        parenlev += 1
-                    elif initial in ')]}':
-                        parenlev -= 1
-                    yield TokenInfo(OP, token, spos, epos, line)
-            else:
-                yield TokenInfo(ERRORTOKEN, line[pos],
-                           (lnum, pos), (lnum, pos+1), line)
-                pos += 1
-
-    # Add an implicit NEWLINE if the input doesn't end in one
-    if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
-        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
-    for indent in indents[1:]:                 # pop remaining indent levels
-        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
-    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
-
+    yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True)
 
 def generate_tokens(readline):
     """Tokenize a source reading Python code as unicode strings.
@@ -620,9 +503,9 @@ def generate_tokens(readline):
     This has the same API as tokenize(), except that it expects the *readline*
     callable to return str objects instead of bytes.
     """
-    return _tokenize(readline, None)
+    return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)
 
-def main():
+def _main(args=None):
     import argparse
 
     # Helper error handling routines
@@ -641,13 +524,13 @@ def error(message, filename=None, location=None):
         sys.exit(1)
 
     # Parse the arguments and options
-    parser = argparse.ArgumentParser(prog='python -m tokenize')
+    parser = argparse.ArgumentParser(color=True)
     parser.add_argument(dest='filename', nargs='?',
                         metavar='filename.py',
                         help='the file to tokenize; defaults to stdin')
     parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                         help='display token names using the exact type')
-    args = parser.parse_args()
+    args = parser.parse_args(args)
 
     try:
         # Tokenize the input
@@ -657,7 +540,9 @@ def error(message, filename=None, location=None):
                 tokens = list(tokenize(f.readline))
         else:
             filename = "<stdin>"
-            tokens = _tokenize(sys.stdin.readline, None)
+            tokens = _generate_tokens_from_c_tokenizer(
+                sys.stdin.readline, extra_tokens=True)
+
 
         # Output the tokenization
         for token in tokens:
@@ -683,5 +568,31 @@ def error(message, filename=None, location=None):
         perror("unexpected error: %s" % err)
         raise
 
+def _transform_msg(msg):
+    """Transform error messages from the C tokenizer into the Python tokenize
+
+    The C tokenizer is more picky than the Python one, so we need to massage
+    the error messages a bit for backwards compatibility.
+    """
+    if "unterminated triple-quoted string literal" in msg:
+        return "EOF in multi-line string"
+    return msg
+
+def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
+    """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
+    if encoding is None:
+        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
+    else:
+        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
+    try:
+        for info in it:
+            yield TokenInfo._make(info)
+    except SyntaxError as e:
+        if type(e) != SyntaxError:
+            raise e from None
+        msg = _transform_msg(e.msg)
+        raise TokenError(msg, (e.lineno, e.offset)) from None
+
+
 if __name__ == "__main__":
-    main()
+    _main()

From 55737ede635ba35bcc60e61420bb3a5a0b5295ec Mon Sep 17 00:00:00 2001
From: "Jeong, YunWon" <jeong@youknowone.org>
Date: Mon, 9 Mar 2026 11:55:31 +0900
Subject: [PATCH 3/3] Rewrite _tokenize with 2-phase model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace per-line reparsing with single-pass tokenization:
- Read all lines via readline, parse once, yield tokens
- Fix token type values (COMMENT=65, NL=66, OP=55)
- Fix NEWLINE/NL end positions and implicit newline handling
- Fix DEDENT positions via look-ahead to next non-DEDENT token
- Handle FSTRING_MIDDLE brace unescaping ({{ → {, }} → })
- Emit implicit NL before ENDMARKER when source lacks trailing newline
- Raise IndentationError from lexer errors
- Remove 14 expectedFailure marks (13 in test_tokenize, 1 in test_tabnanny) for now-passing tests
---
 Lib/test/test_tabnanny.py      |   1 -
 Lib/test/test_tokenize.py      |  13 -
 crates/stdlib/src/_tokenize.rs | 747 +++++++++++++++++++++++++++++++++
 crates/stdlib/src/lib.rs       |   5 +-
 crates/stdlib/src/tokenize.rs  | 391 -----------------
 5 files changed, 750 insertions(+), 407 deletions(-)
 create mode 100644 crates/stdlib/src/_tokenize.rs
 delete mode 100644 crates/stdlib/src/tokenize.rs

diff --git a/Lib/test/test_tabnanny.py b/Lib/test/test_tabnanny.py
index 372be9eb8c3..d7a77eb26e4 100644
--- a/Lib/test/test_tabnanny.py
+++ b/Lib/test/test_tabnanny.py
@@ -316,7 +316,6 @@ def validate_cmd(self, *args, stdout="", stderr="", partial=False, expect_failur
             self.assertListEqual(out.splitlines(), stdout.splitlines())
             self.assertListEqual(err.splitlines(), stderr.splitlines())
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; Should displays error when errored python file is given.
     def test_with_errored_file(self):
         """Should displays error when errored python file is given."""
         with TemporaryPyFile(SOURCE_CODES["wrong_indented"]) as file_path:
diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index c10f80a723c..394a87c3601 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1167,7 +1167,6 @@ async def bar(): pass
     DEDENT     ''            (7, 0) (7, 0)
     """)
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; +  "    NEWLINE    '\\n'          (4, 1) (4, 2)"]
     def test_newline_after_parenthesized_block_with_comment(self):
         self.check_tokenize('''\
 [
@@ -1192,7 +1191,6 @@ def test_closing_parenthesis_from_different_line(self):
     NAME       'x'           (1, 3) (1, 4)
     """)
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; '    FSTRING_END "\'\'\'"         (2, 68) (2, 71)']
     def test_multiline_non_ascii_fstring(self):
         self.check_tokenize("""\
 a = f'''
@@ -1204,7 +1202,6 @@ def test_multiline_non_ascii_fstring(self):
     FSTRING_END "\'\'\'"         (2, 68) (2, 71)
     """)
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; Diff is 696 characters long. Set self.maxDiff to None to see it.
     def test_multiline_non_ascii_fstring_with_expr(self):
         self.check_tokenize("""\
 f'''
@@ -2176,7 +2173,6 @@ def test_string_concatenation(self):
         # Two string literals on the same line
         self.check_roundtrip("'' ''")
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON
     def test_random_files(self):
         # Test roundtrip on random python modules.
         # pass the '-ucpu' option to process the full directory.
@@ -2214,7 +2210,6 @@ def test_indentation_semantics_retained(self):
 
 
 class InvalidPythonTests(TestCase):
-    @unittest.expectedFailure  # TODO: RUSTPYTHON; Diff is 1046 characters long. Set self.maxDiff to None to see it.
     def test_number_followed_by_name(self):
         # See issue #gh-105549
         source = "2sin(x)"
@@ -2254,7 +2249,6 @@ def check_tokenize(self, s, expected):
             )
             self.assertEqual(result, expected.rstrip().splitlines())
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON
     def test_encoding(self):
         def readline(encoding):
             yield "1+1".encode(encoding)
@@ -2386,7 +2380,6 @@ def test_float(self):
     NUMBER     '3.14e159'    (1, 4) (1, 12)
     """)
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON
     def test_string(self):
 
         self.check_tokenize('x = \'\'; y = ""', """\
@@ -2818,7 +2811,6 @@ def test_unary(self):
     NUMBER     '1'           (1, 22) (1, 23)
     """)
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON
     def test_selector(self):
 
         self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
@@ -2841,7 +2833,6 @@ def test_selector(self):
     RPAR       ')'           (2, 29) (2, 30)
     """)
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON
     def test_method(self):
 
         self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
@@ -2859,7 +2850,6 @@ def test_method(self):
     NAME       'pass'        (2, 14) (2, 18)
     """)
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON
     def test_tabs(self):
 
         self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
@@ -3144,7 +3134,6 @@ async def bar(): pass
     DEDENT     ''            (6, -1) (6, -1)
     """)
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON
     def test_unicode(self):
 
         self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
@@ -3394,7 +3383,6 @@ def f():
             with contextlib.redirect_stderr(StringIO()):
                 _ = self.invoke_tokenize('--unknown')
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON
     def test_without_flag(self):
         # test 'python -m tokenize source.py'
         source = 'a = 1'
@@ -3408,7 +3396,6 @@ def test_without_flag(self):
         '''
         self.check_output(source, expect)
 
-    @unittest.expectedFailure  # TODO: RUSTPYTHON
     def test_exact_flag(self):
         # test 'python -m tokenize -e/--exact source.py'
         source = 'a = 1'
diff --git a/crates/stdlib/src/_tokenize.rs b/crates/stdlib/src/_tokenize.rs
new file mode 100644
index 00000000000..13e40ff12b0
--- /dev/null
+++ b/crates/stdlib/src/_tokenize.rs
@@ -0,0 +1,747 @@
+pub(crate) use _tokenize::module_def;
+
+#[pymodule]
+mod _tokenize {
+    use crate::{
+        common::lock::PyRwLock,
+        vm::{
+            AsObject, Py, PyObjectRef, PyPayload, PyResult, VirtualMachine,
+            builtins::{PyBytes, PyStr, PyType},
+            convert::ToPyObject,
+            function::ArgCallable,
+            protocol::PyIterReturn,
+            types::{Constructor, IterNext, Iterable, SelfIter},
+        },
+    };
+    use ruff_python_ast::PySourceType;
+    use ruff_python_ast::token::{Token, TokenKind};
+    use ruff_python_parser::{
+        LexicalErrorType, ParseError, ParseErrorType, parse_unchecked_source,
+    };
+    use ruff_source_file::{LineIndex, LineRanges};
+    use ruff_text_size::{Ranged, TextSize};
+    use core::fmt;
+
+    const TOKEN_ENDMARKER: u8 = 0;
+    const TOKEN_DEDENT: u8 = 6; // with extra_tokens, kinds strictly above this ...
+    const TOKEN_OP: u8 = 55; // ... and strictly below this are reported as TOKEN_OP
+    const TOKEN_COMMENT: u8 = 65;
+    const TOKEN_NL: u8 = 66;
+
+    #[pyattr]
+    #[pyclass(name = "TokenizerIter")]
+    #[derive(PyPayload)]
+    pub struct PyTokenizerIter {
+        readline: ArgCallable, // invoked with no arguments to fetch each source line
+        extra_tokens: bool, // forwarded to `emit_next_token` on every `next` call
+        encoding: Option<String>, // Some: readline yields bytes to decode; None: str
+        state: PyRwLock<TokenizerState>, // created in `TokenizerPhase::Reading`
+    }
+
+    impl PyTokenizerIter {
+        fn readline(&self, vm: &VirtualMachine) -> PyResult<String> {
+            let raw_line = match self.readline.invoke((), vm) {
+                Ok(v) => v,
+                Err(err) => {
+                    if err.fast_isinstance(vm.ctx.exceptions.stop_iteration) {
+                        return Ok(String::new()); // StopIteration is reported as an empty line
+                    }
+                    return Err(err); // every other exception propagates to the caller
+                }
+            };
+            Ok(match &self.encoding {
+                Some(encoding) => {
+                    let bytes = raw_line
+                        .downcast::<PyBytes>() // bytes expected when an encoding is set
+                        .map_err(|_| vm.new_type_error("readline() returned a non-bytes object"))?;
+                    vm.state
+                        .codec_registry
+                        .decode_text(bytes.into(), encoding, None, vm) // decode with that encoding
+                        .map(|s| s.to_string())?
+                }
+                None => raw_line
+                    .downcast::<PyStr>() // str expected when no encoding is set
+                    .map(|s| s.to_string())
+                    .map_err(|_| vm.new_type_error("readline() returned a non-string object"))?,
+            })
+        }
+    }
+
+    impl fmt::Debug for PyTokenizerIter {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            f.debug_struct("PyTokenizerIter")
+                .field("extra_tokens", &self.extra_tokens)
+                .field("encoding", &self.encoding)
+                .finish() // `readline` and `state` are left out of the output
+        }
+    }
+
+    #[pyclass(with(Constructor, Iterable, IterNext))]
+    impl PyTokenizerIter {}
+
+    impl Constructor for PyTokenizerIter {
+        type Args = PyTokenizerIterArgs;
+
+        fn py_new(_cls: &Py<PyType>, args: Self::Args, _vm: &VirtualMachine) -> PyResult<Self> {
+            let Self::Args {
+                readline,
+                extra_tokens,
+                encoding,
+            } = args;
+
+            Ok(Self {
+                readline,
+                extra_tokens,
+                encoding: encoding.map(|s| s.to_string()), // owned copy of the encoding name
+                state: PyRwLock::new(TokenizerState {
+                    phase: TokenizerPhase::Reading {
+                        source: String::new(), // nothing read yet; `next` accumulates lines
+                    },
+                }),
+            })
+        }
+    }
+
+    impl SelfIter for PyTokenizerIter {}
+
+    impl IterNext for PyTokenizerIter {
+        /// Return the next token tuple; the first call drains `readline` and parses once.
+        fn next(zelf: &Py<Self>, vm: &VirtualMachine) -> PyResult<PyIterReturn> {
+            // Take the state instead of cloning it (a clone copies the whole source and
+            // token list per token). `Done` stays behind if an error propagates below.
+            let phase = core::mem::replace(&mut zelf.state.write().phase, TokenizerPhase::Done);
+            let mut state = TokenizerState { phase };
+            loop {
+                match &mut state.phase {
+                    TokenizerPhase::Reading { source } => {
+                        let line = zelf.readline(vm)?;
+                        if !line.is_empty() {
+                            source.push_str(&line);
+                            continue;
+                        }
+                        // An empty line means EOF: parse everything that was read.
+                        let accumulated = core::mem::take(source);
+                        let parsed = parse_unchecked_source(&accumulated, PySourceType::Python);
+                        let tokens: Vec<Token> = parsed.tokens().iter().copied().collect();
+                        let errors: Vec<ParseError> = parsed.errors().to_vec();
+                        let line_index = LineIndex::from_source_text(&accumulated);
+                        let need_implicit_nl = !accumulated.ends_with('\n');
+                        state.phase = TokenizerPhase::Yielding {
+                            source: accumulated,
+                            tokens,
+                            errors,
+                            index: 0,
+                            line_index,
+                            need_implicit_nl,
+                            pending_fstring_parts: Vec::new(),
+                            pending_empty_fstring_middle: None,
+                        };
+                    }
+                    TokenizerPhase::Yielding { .. } => {
+                        let result = emit_next_token(&mut state, zelf.extra_tokens, vm)?;
+                        *zelf.state.write() = state;
+                        return Ok(result);
+                    }
+                    TokenizerPhase::Done => return Ok(PyIterReturn::StopIteration(None)),
+                }
+            }
+        }
+    }
+
+    /// Emit the next token from the Yielding phase.
+    fn emit_next_token(
+        state: &mut TokenizerState,
+        extra_tokens: bool,
+        vm: &VirtualMachine,
+    ) -> PyResult<PyIterReturn> {
+        let TokenizerPhase::Yielding {
+            source,
+            tokens,
+            errors,
+            index,
+            line_index,
+            need_implicit_nl,
+            pending_fstring_parts,
+            pending_empty_fstring_middle,
+        } = &mut state.phase
+        else {
+            unreachable!()
+        };
+
+        // Emit pending empty FSTRING_MIDDLE (for format spec nesting)
+        if let Some((mid_type, mid_line, mid_col, mid_line_str)) =
+            pending_empty_fstring_middle.take()
+        {
+            return Ok(PyIterReturn::Return(make_token_tuple(
+                vm,
+                mid_type,
+                "",
+                mid_line,
+                mid_col as isize,
+                mid_line,
+                mid_col as isize,
+                &mid_line_str,
+            )));
+        }
+
+        // Emit any pending fstring sub-tokens first
+        if let Some((tok_type, tok_str, sl, sc, el, ec)) = pending_fstring_parts.pop() {
+            let offset: usize = source
+                .lines()
+                .take(sl.saturating_sub(1))
+                .map(|l| l.len() + 1)
+                .sum();
+            let full_line =
+                source.full_line_str(TextSize::from(offset.min(source.len()) as u32));
+            return Ok(PyIterReturn::Return(make_token_tuple(
+                vm, tok_type, &tok_str, sl, sc as isize, el, ec as isize, full_line,
+            )));
+        }
+
+        let source_len = TextSize::from(source.len() as u32);
+
+        while *index < tokens.len() {
+            let token = tokens[*index];
+            *index += 1;
+            let kind = token.kind();
+            let range = token.range();
+
+            // Check for lexical indentation errors.
+            // Skip when source has tabs — ruff and CPython handle tab
+            // indentation differently (CPython uses tabsize=8), so ruff may
+            // report false IndentationErrors for valid mixed-tab code.
+            if !source.contains('\t') {
+                for err in errors.iter() {
+                    if !matches!(
+                        err.error,
+                        ParseErrorType::Lexical(LexicalErrorType::IndentationError)
+                    ) {
+                        continue;
+                    }
+                    if err.location.start() <= range.start()
+                        && range.start() < err.location.end()
+                    {
+                        return Err(raise_indentation_error(vm, err, source, line_index));
+                    }
+                }
+            }
+
+            if kind == TokenKind::EndOfFile {
+                continue;
+            }
+
+            if !extra_tokens
+                && matches!(kind, TokenKind::Comment | TokenKind::NonLogicalNewline)
+            {
+                continue;
+            }
+
+            let raw_type = token_kind_value(kind);
+            let token_type = if extra_tokens && raw_type > TOKEN_DEDENT && raw_type < TOKEN_OP
+            {
+                TOKEN_OP
+            } else {
+                raw_type
+            };
+
+            let (token_str, start_line, start_col, end_line, end_col, line_str) =
+                if kind == TokenKind::Dedent {
+                    let last_line = source.lines().count();
+                    let default_pos = if extra_tokens {
+                        (last_line + 1, 0)
+                    } else {
+                        (last_line, 0)
+                    };
+                    let (pos, dedent_line) =
+                        next_non_dedent_info(tokens, *index, source, line_index, default_pos);
+                    ("", pos.0, pos.1, pos.0, pos.1, dedent_line)
+                } else {
+                    let start_lc = line_index.line_column(range.start(), source);
+                    let start_line = start_lc.line.get();
+                    let start_col = start_lc.column.to_zero_indexed();
+                    let implicit_newline = range.start() >= source_len;
+                    let in_source = range.end() <= source_len;
+
+                    let (s, el, ec) = if kind == TokenKind::Newline {
+                        if extra_tokens {
+                            if implicit_newline {
+                                ("", start_line, start_col + 1)
+                            } else {
+                                let s = if source[range].starts_with('\r') {
+                                    "\r\n"
+                                } else {
+                                    "\n"
+                                };
+                                (s, start_line, start_col + s.len())
+                            }
+                        } else {
+                            ("", start_line, start_col)
+                        }
+                    } else if kind == TokenKind::NonLogicalNewline {
+                        let s = if in_source { &source[range] } else { "" };
+                        (s, start_line, start_col + s.len())
+                    } else {
+                        let end_lc = line_index.line_column(range.end(), source);
+                        let s = if in_source { &source[range] } else { "" };
+                        (s, end_lc.line.get(), end_lc.column.to_zero_indexed())
+                    };
+                    let line_str = source.full_line_str(range.start());
+                    (s, start_line, start_col, el, ec, line_str)
+                };
+
+            // Handle FSTRING_MIDDLE/TSTRING_MIDDLE brace unescaping
+            if matches!(kind, TokenKind::FStringMiddle | TokenKind::TStringMiddle)
+                && (token_str.contains("{{") || token_str.contains("}}"))
+            {
+                let mut parts =
+                    split_fstring_middle(token_str, token_type, start_line, start_col)
+                        .into_iter();
+                let (tt, ts, sl, sc, el, ec) = parts.next().unwrap();
+                let rest: Vec<_> = parts.collect();
+                for p in rest.into_iter().rev() {
+                    pending_fstring_parts.push(p);
+                }
+                return Ok(PyIterReturn::Return(make_token_tuple(
+                    vm, tt, &ts, sl, sc as isize, el, ec as isize, line_str,
+                )));
+            }
+
+            // After emitting a Rbrace inside an fstring, check if the
+            // next token is also Rbrace without an intervening FStringMiddle.
+            // CPython emits an empty FSTRING_MIDDLE in that position.
+            if kind == TokenKind::Rbrace
+                && tokens
+                    .get(*index)
+                    .is_some_and(|t| t.kind() == TokenKind::Rbrace)
+            {
+                let mid_type = find_fstring_middle_type(tokens, *index);
+                *pending_empty_fstring_middle = Some((
+                    mid_type,
+                    end_line,
+                    end_col,
+                    line_str.to_string(),
+                ));
+            }
+
+            return Ok(PyIterReturn::Return(make_token_tuple(
+                vm, token_type, token_str, start_line, start_col as isize, end_line,
+                end_col as isize, line_str,
+            )));
+        }
+
+        // Emit implicit NL before ENDMARKER if source
+        // doesn't end with newline and last token is Comment
+        if extra_tokens && core::mem::take(need_implicit_nl) {
+            let last_tok = tokens
+                .iter()
+                .rev()
+                .find(|t| t.kind() != TokenKind::EndOfFile);
+            if let Some(last) = last_tok.filter(|t| t.kind() == TokenKind::Comment) {
+                let end_lc = line_index.line_column(last.range().end(), source);
+                let nl_line = end_lc.line.get();
+                let nl_col = end_lc.column.to_zero_indexed();
+                return Ok(PyIterReturn::Return(make_token_tuple(
+                    vm,
+                    TOKEN_NL,
+                    "",
+                    nl_line,
+                    nl_col as isize,
+                    nl_line,
+                    nl_col as isize + 1,
+                    source.full_line_str(last.range().start()),
+                )));
+            }
+        }
+
+        // Check for unclosed brackets before ENDMARKER — CPython's tokenizer
+        // raises SyntaxError("EOF in multi-line statement") in this case.
+        {
+            let bracket_count: i32 = tokens
+                .iter()
+                .map(|t| match t.kind() {
+                    TokenKind::Lpar | TokenKind::Lsqb | TokenKind::Lbrace => 1,
+                    TokenKind::Rpar | TokenKind::Rsqb | TokenKind::Rbrace => -1,
+                    _ => 0,
+                })
+                .sum();
+            if bracket_count > 0 {
+                let last_line = source.lines().count();
+                return Err(raise_syntax_error(
+                    vm,
+                    "EOF in multi-line statement",
+                    last_line + 1,
+                    0,
+                ));
+            }
+        }
+
+        // All tokens consumed — emit ENDMARKER
+        let last_line = source.lines().count();
+        let (em_line, em_col, em_line_str): (usize, isize, &str) = if extra_tokens {
+            (last_line + 1, 0, "")
+        } else {
+            let last_line_text = source.full_line_str(TextSize::from(
+                source.len().saturating_sub(1) as u32,
+            ));
+            (last_line, -1, last_line_text)
+        };
+
+        let result = make_token_tuple(
+            vm, TOKEN_ENDMARKER, "", em_line, em_col, em_line, em_col, em_line_str,
+        );
+        state.phase = TokenizerPhase::Done;
+        Ok(PyIterReturn::Return(result))
+    }
+
+    /// Determine whether to emit FSTRING_MIDDLE (60) or TSTRING_MIDDLE (63)
+    /// by looking back from `index` for the most recent unmatched FStringStart/TStringStart.
+    fn find_fstring_middle_type(tokens: &[Token], index: usize) -> u8 {
+        let mut depth = 0i32; // End tokens passed whose matching Start has not been reached yet
+        for i in (0..index).rev() { // scans tokens[index - 1] down to tokens[0]; tokens[index] is not inspected
+            match tokens[i].kind() {
+                TokenKind::FStringEnd | TokenKind::TStringEnd => depth += 1, // an already-closed literal lies behind us
+                TokenKind::FStringStart => {
+                    if depth == 0 {
+                        return 60; // FSTRING_MIDDLE
+                    }
+                    depth -= 1; // this Start pairs with an End counted above
+                }
+                TokenKind::TStringStart => {
+                    if depth == 0 {
+                        return 63; // TSTRING_MIDDLE
+                    }
+                    depth -= 1; // this Start pairs with an End counted above
+                }
+                _ => {}
+            }
+        }
+        60 // default to FSTRING_MIDDLE when no unmatched Start precedes `index`
+    }
+
+    /// Find the position and source line of the first token in `tokens[index..]` that is not a DEDENT.
+    /// Returns ((line, col), line_str); if EndOfFile comes first or no token is left, returns (default_pos, "").
+    fn next_non_dedent_info<'a>(
+        tokens: &[Token],
+        index: usize,
+        source: &'a str,
+        line_index: &LineIndex,
+        default_pos: (usize, usize),
+    ) -> ((usize, usize), &'a str) {
+        for future in &tokens[index..] {
+            match future.kind() {
+                TokenKind::Dedent => continue, // consecutive DEDENTs are skipped
+                TokenKind::EndOfFile => return (default_pos, ""), // only EOF follows: use the caller's fallback
+                _ => {
+                    let flc = line_index.line_column(future.range().start(), source);
+                    let pos = (flc.line.get(), flc.column.to_zero_indexed()); // column is converted to zero-indexed
+                    return (pos, source.full_line_str(future.range().start()));
+                }
+            }
+        }
+        (default_pos, "") // token slice exhausted without meeting EndOfFile
+    }
+
+    /// Build a SyntaxError with the given message and position; the exception is returned, not raised here.
+    fn raise_syntax_error(
+        vm: &VirtualMachine,
+        msg: &str,
+        lineno: usize,
+        offset: usize,
+    ) -> rustpython_vm::builtins::PyBaseExceptionRef {
+        let exc = vm.new_exception_msg(
+            vm.ctx.exceptions.syntax_error.to_owned(),
+            msg.into(),
+        );
+        let obj = exc.as_object();
+        let _ = obj.set_attr("msg", vm.ctx.new_str(msg), vm); // `let _`: a failed set_attr is silently ignored
+        let _ = obj.set_attr("lineno", vm.ctx.new_int(lineno), vm); // stored exactly as passed by the caller
+        let _ = obj.set_attr("offset", vm.ctx.new_int(offset), vm); // stored exactly as passed by the caller
+        let _ = obj.set_attr("filename", vm.ctx.new_str("<string>"), vm); // fixed placeholder filename
+        let _ = obj.set_attr("text", vm.ctx.none(), vm); // no source text is attached
+        exc
+    }
+
+    /// Raise an IndentationError from a parse error.
+    fn raise_indentation_error(
+        vm: &VirtualMachine,
+        err: &ParseError,
+        source: &str,
+        line_index: &LineIndex,
+    ) -> rustpython_vm::builtins::PyBaseExceptionRef {
+        let err_lc = line_index.line_column(err.location.start(), source);
+        let err_line_text = source.full_line_str(err.location.start());
+        let err_text = err_line_text.trim_end_matches('\n').trim_end_matches('\r');
+        let msg = format!("{}", err.error);
+        let exc = vm.new_exception_msg(
+            vm.ctx.exceptions.indentation_error.to_owned(),
+            msg.clone().into(),
+        );
+        let obj = exc.as_object();
+        let _ = obj.set_attr("lineno", vm.ctx.new_int(err_lc.line.get()), vm);
+        let _ = obj.set_attr("offset", vm.ctx.new_int(err_text.len() as i64 + 1), vm);
+        let _ = obj.set_attr("msg", vm.ctx.new_str(msg), vm);
+        let _ = obj.set_attr("filename", vm.ctx.new_str("<string>"), vm);
+        let _ = obj.set_attr("text", vm.ctx.new_str(err_text), vm);
+        exc
+    }
+
+    /// Split an FSTRING_MIDDLE/TSTRING_MIDDLE token containing `{{`/`}}`
+    /// into multiple unescaped sub-tokens. Columns are counted in characters, not UTF-8 bytes.
+    /// Returns vec of (type, string, start_line, start_col, end_line, end_col).
+    fn split_fstring_middle(
+        raw: &str,
+        token_type: u8,
+        start_line: usize,
+        start_col: usize,
+    ) -> Vec<(u8, String, usize, usize, usize, usize)> {
+        let mut parts = Vec::new();
+        let mut current = String::new();
+        // Track source position (line, col) — these correspond to the
+        // original source positions (with {{ and }} still doubled)
+        let mut cur_line = start_line;
+        let mut cur_col = start_col;
+        // Track the start position of the current accumulating part
+        let mut part_start_line = cur_line;
+        let mut part_start_col = cur_col;
+        let mut chars = raw.chars().peekable();
+
+        // Compute end position of the accumulated (already unescaped) text, so a collapsed `{{` counts as one column
+        let end_pos = |current: &str, start_line: usize, start_col: usize| -> (usize, usize) {
+            let mut el = start_line;
+            let mut ec = start_col;
+            for ch in current.chars() {
+                if ch == '\n' {
+                    el += 1;
+                    ec = 0;
+                } else {
+                    ec += 1; // one column per character: the caller's start_col comes from a character-based line/column lookup
+                }
+            }
+            (el, ec)
+        };
+
+        while let Some(ch) = chars.next() {
+            if ch == '{' && chars.peek() == Some(&'{') {
+                chars.next();
+                current.push('{'); // `{{` is kept inside the current part as a single `{`
+                cur_col += 2; // skip both {{ in source
+            } else if ch == '}' && chars.peek() == Some(&'}') {
+                chars.next();
+                // Flush accumulated text before }}
+                if !current.is_empty() {
+                    let (el, ec) = end_pos(&current, part_start_line, part_start_col);
+                    parts.push((
+                        token_type,
+                        core::mem::take(&mut current),
+                        part_start_line,
+                        part_start_col,
+                        el,
+                        ec,
+                    ));
+                }
+                // Emit unescaped '}' at source position of }}
+                parts.push((
+                    token_type,
+                    "}".to_string(),
+                    cur_line,
+                    cur_col,
+                    cur_line,
+                    cur_col + 1,
+                ));
+                cur_col += 2; // skip both }} in source
+                part_start_line = cur_line;
+                part_start_col = cur_col;
+            } else {
+                if current.is_empty() {
+                    part_start_line = cur_line;
+                    part_start_col = cur_col;
+                }
+                current.push(ch);
+                if ch == '\n' {
+                    cur_line += 1;
+                    cur_col = 0;
+                } else {
+                    cur_col += 1; // one column per character, not per UTF-8 byte
+                }
+            }
+        }
+
+        if !current.is_empty() {
+            let (el, ec) = end_pos(&current, part_start_line, part_start_col);
+            parts.push((token_type, current, part_start_line, part_start_col, el, ec));
+        }
+
+        parts
+    }
+
+    #[allow(clippy::too_many_arguments)] // one argument per component of the emitted token tuple
+    fn make_token_tuple(
+        vm: &VirtualMachine,
+        token_type: u8,
+        string: &str,
+        start_line: usize,
+        start_col: isize, // signed: the ENDMARKER path passes -1 as the column
+        end_line: usize,
+        end_col: isize, // signed for the same reason as start_col
+        line: &str,
+    ) -> PyObjectRef {
+        vm.ctx // builds (token_type, string, (start_line, start_col), (end_line, end_col), line)
+            .new_tuple(vec![
+                token_type.to_pyobject(vm),
+                vm.ctx.new_str(string).into(),
+                vm.ctx
+                    .new_tuple(vec![start_line.to_pyobject(vm), start_col.to_pyobject(vm)])
+                    .into(),
+                vm.ctx
+                    .new_tuple(vec![end_line.to_pyobject(vm), end_col.to_pyobject(vm)])
+                    .into(),
+                vm.ctx.new_str(line).into(),
+            ])
+            .into()
+    }
+
+    #[derive(FromArgs)] // constructor arguments, parsed according to the #[pyarg] attributes below
+    pub struct PyTokenizerIterArgs {
+        #[pyarg(positional)]
+        readline: ArgCallable, // positional callable; NOTE(review): presumably called to obtain source lines — confirm in the iterator impl
+        #[pyarg(named)]
+        extra_tokens: bool, // keyword argument without a default
+        #[pyarg(named, optional)]
+        encoding: Option<rustpython_vm::PyRef<PyStr>>, // optional keyword argument; None when omitted
+    }
+
+    #[derive(Clone, Debug)]
+    struct TokenizerState { // iterator state; currently only the phase
+        phase: TokenizerPhase, // switched to `Done` right before the ENDMARKER tuple is returned
+    }
+
+    #[derive(Clone, Debug)]
+    enum TokenizerPhase { // stages the token iterator moves through
+        Reading { // NOTE(review): presumably source text is still being collected from readline — confirm
+            source: String,
+        },
+        Yielding { // the source has been tokenized; tokens are handed out one per call
+            source: String,
+            tokens: Vec<Token>,
+            errors: Vec<ParseError>,
+            index: usize, // position in `tokens`; NOTE(review): presumably the next token to examine — confirm
+            line_index: LineIndex, // used to turn text offsets into (line, column) pairs
+            need_implicit_nl: bool, // read with mem::take, so the implicit NL before ENDMARKER is emitted at most once
+            /// Pending sub-tokens from FSTRING_MIDDLE splitting (the producer pushes the remaining parts in reverse order)
+            pending_fstring_parts: Vec<(u8, String, usize, usize, usize, usize)>,
+            /// Pending empty FSTRING_MIDDLE for format spec nesting:
+            /// (type, line, col, line_str)
+            pending_empty_fstring_middle: Option<(u8, usize, usize, String)>,
+        },
+        Done, // set once ENDMARKER has been produced
+    }
+
+    const fn token_kind_value(kind: TokenKind) -> u8 { // numeric token type reported for each ruff TokenKind
+        match kind {
+            TokenKind::EndOfFile => 0,
+            TokenKind::Name
+            | TokenKind::For
+            | TokenKind::In
+            | TokenKind::Pass
+            | TokenKind::Class
+            | TokenKind::And
+            | TokenKind::Is
+            | TokenKind::Raise
+            | TokenKind::True
+            | TokenKind::False
+            | TokenKind::Assert
+            | TokenKind::Try
+            | TokenKind::While
+            | TokenKind::Yield
+            | TokenKind::Lambda
+            | TokenKind::None
+            | TokenKind::Not
+            | TokenKind::Or
+            | TokenKind::Break
+            | TokenKind::Continue
+            | TokenKind::Global
+            | TokenKind::Nonlocal
+            | TokenKind::Return
+            | TokenKind::Except
+            | TokenKind::Import
+            | TokenKind::Case
+            | TokenKind::Match
+            | TokenKind::Type
+            | TokenKind::Await
+            | TokenKind::With
+            | TokenKind::Del
+            | TokenKind::Finally
+            | TokenKind::From
+            | TokenKind::Def
+            | TokenKind::If
+            | TokenKind::Else
+            | TokenKind::Elif
+            | TokenKind::As
+            | TokenKind::Async => 1, // keywords are reported with the same type as plain names
+            TokenKind::Int | TokenKind::Complex | TokenKind::Float => 2, // all numeric literals share one type
+            TokenKind::String => 3,
+            TokenKind::Newline => 4,
+            TokenKind::NonLogicalNewline => TOKEN_NL, // constant defined elsewhere in this module
+            TokenKind::Indent => 5,
+            TokenKind::Dedent => 6,
+            TokenKind::Lpar => 7, // 7..=54: one code per operator/delimiter; the caller folds codes strictly between TOKEN_DEDENT and TOKEN_OP into TOKEN_OP when extra_tokens is set
+            TokenKind::Rpar => 8,
+            TokenKind::Lsqb => 9,
+            TokenKind::Rsqb => 10,
+            TokenKind::Colon => 11,
+            TokenKind::Comma => 12,
+            TokenKind::Semi => 13,
+            TokenKind::Plus => 14,
+            TokenKind::Minus => 15,
+            TokenKind::Star => 16,
+            TokenKind::Slash => 17,
+            TokenKind::Vbar => 18,
+            TokenKind::Amper => 19,
+            TokenKind::Less => 20,
+            TokenKind::Greater => 21,
+            TokenKind::Equal => 22,
+            TokenKind::Dot => 23,
+            TokenKind::Percent => 24,
+            TokenKind::Lbrace => 25,
+            TokenKind::Rbrace => 26,
+            TokenKind::EqEqual => 27,
+            TokenKind::NotEqual => 28,
+            TokenKind::LessEqual => 29,
+            TokenKind::GreaterEqual => 30,
+            TokenKind::Tilde => 31,
+            TokenKind::CircumFlex => 32,
+            TokenKind::LeftShift => 33,
+            TokenKind::RightShift => 34,
+            TokenKind::DoubleStar => 35,
+            TokenKind::PlusEqual => 36,
+            TokenKind::MinusEqual => 37,
+            TokenKind::StarEqual => 38,
+            TokenKind::SlashEqual => 39,
+            TokenKind::PercentEqual => 40,
+            TokenKind::AmperEqual => 41,
+            TokenKind::VbarEqual => 42,
+            TokenKind::CircumflexEqual => 43,
+            TokenKind::LeftShiftEqual => 44,
+            TokenKind::RightShiftEqual => 45,
+            TokenKind::DoubleStarEqual => 46,
+            TokenKind::DoubleSlash => 47,
+            TokenKind::DoubleSlashEqual => 48,
+            TokenKind::At => 49,
+            TokenKind::AtEqual => 50,
+            TokenKind::Rarrow => 51,
+            TokenKind::Ellipsis => 52,
+            TokenKind::ColonEqual => 53,
+            TokenKind::Exclamation => 54,
+            TokenKind::FStringStart => 59,
+            TokenKind::FStringMiddle => 60, // same value find_fstring_middle_type returns for FSTRING_MIDDLE
+            TokenKind::FStringEnd => 61,
+            TokenKind::Comment => TOKEN_COMMENT, // constant defined elsewhere in this module
+            TokenKind::TStringStart => 62,
+            TokenKind::TStringMiddle => 63, // same value find_fstring_middle_type returns for TSTRING_MIDDLE
+            TokenKind::TStringEnd => 64,
+            TokenKind::IpyEscapeCommand
+            | TokenKind::Question
+            | TokenKind::Unknown => 67, // ERRORTOKEN
+        }
+    }
+}
diff --git a/crates/stdlib/src/lib.rs b/crates/stdlib/src/lib.rs
index 04aa623d185..4c06eea9ef4 100644
--- a/crates/stdlib/src/lib.rs
+++ b/crates/stdlib/src/lib.rs
@@ -49,7 +49,8 @@ mod pystruct;
 mod random;
 mod statistics;
 mod suggestions;
-mod tokenize;
+#[path = "_tokenize.rs"]
+mod _tokenize;
 // TODO: maybe make this an extension module, if we ever get those
 // mod re;
 #[cfg(all(feature = "host_env", not(target_arch = "wasm32")))]
@@ -226,7 +227,7 @@ pub fn stdlib_module_defs(ctx: &Context) -> Vec<&'static builtins::PyModuleDef>
         ssl::module_def(ctx),
         statistics::module_def(ctx),
         suggestions::module_def(ctx),
-        tokenize::module_def(ctx),
+        _tokenize::module_def(ctx),
         #[cfg(all(feature = "host_env", unix, not(target_os = "redox")))]
         syslog::module_def(ctx),
         #[cfg(all(
diff --git a/crates/stdlib/src/tokenize.rs b/crates/stdlib/src/tokenize.rs
deleted file mode 100644
index 33667a203ec..00000000000
--- a/crates/stdlib/src/tokenize.rs
+++ /dev/null
@@ -1,391 +0,0 @@
-pub(crate) use _tokenize::module_def;
-
-#[pymodule]
-mod _tokenize {
-    use crate::{
-        common::lock::PyRwLock,
-        vm::{
-            AsObject, Py, PyPayload, PyResult, VirtualMachine,
-            builtins::{PyBytes, PyStr, PyType},
-            convert::ToPyObject,
-            function::ArgCallable,
-            protocol::PyIterReturn,
-            types::{Constructor, IterNext, Iterable, SelfIter},
-        },
-    };
-    use ruff_python_ast::PySourceType;
-    use ruff_python_ast::token::{Token, TokenKind, Tokens};
-    use ruff_python_parser::{ParseError, parse_unchecked_source};
-    use ruff_source_file::{LineIndex, LineRanges};
-    use ruff_text_size::{Ranged, TextRange};
-    use std::{cmp::Ordering, fmt};
-
-    /// `__import__("token").OP`
-    const TOKEN_OP: u8 = 55;
-
-    #[pyattr]
-    #[pyclass(name = "TokenizerIter")]
-    #[derive(PyPayload)]
-    pub struct PyTokenizerIter {
-        readline: ArgCallable, // TODO: This should be PyObject
-        extra_tokens: bool,
-        encoding: Option<String>,
-        state: PyRwLock<PyTokenizerIterState>,
-    }
-
-    impl PyTokenizerIter {
-        fn readline(&self, vm: &VirtualMachine) -> PyResult<String> {
-            // TODO: When `readline` is PyObject,
-            // we need to check if it's callable and raise a type error if it's not.
-            let raw_line = match self.readline.invoke((), vm) {
-                Ok(v) => v,
-                Err(err) => {
-                    if err.fast_isinstance(vm.ctx.exceptions.stop_iteration) {
-                        return Ok(String::new());
-                    }
-                    return Err(err);
-                }
-            };
-            Ok(match &self.encoding {
-                Some(encoding) => {
-                    let bytes = raw_line
-                        .downcast::<PyBytes>()
-                        .map_err(|_| vm.new_type_error("readline() returned a non-bytes object"))?;
-                    vm.state
-                        .codec_registry
-                        .decode_text(bytes.into(), encoding, None, vm)
-                        .map(|s| s.to_string())?
-                }
-                None => raw_line
-                    .downcast::<PyStr>()
-                    .map(|s| s.to_string())
-                    .map_err(|_| vm.new_type_error("readline() returned a non-string object"))?,
-            })
-        }
-    }
-
-    impl fmt::Debug for PyTokenizerIter {
-        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-            f.debug_struct("PyTokenizerIter")
-                .field("readline", &self.readline)
-                .field("encoding", &self.encoding)
-                .field("extra_tokens", &self.extra_tokens)
-                .finish()
-        }
-    }
-
-    #[pyclass(with(Constructor, Iterable, IterNext))]
-    impl PyTokenizerIter {}
-
-    impl Constructor for PyTokenizerIter {
-        type Args = PyTokenizerIterArgs;
-
-        fn py_new(_cls: &Py<PyType>, args: Self::Args, _vm: &VirtualMachine) -> PyResult<Self> {
-            let Self::Args {
-                readline,
-                extra_tokens,
-                encoding,
-            } = args;
-
-            Ok(Self {
-                readline,
-                extra_tokens,
-                encoding: encoding.map(|s| s.to_string()),
-                state: PyRwLock::new(PyTokenizerIterState::default()),
-            })
-        }
-    }
-
-    impl SelfIter for PyTokenizerIter {}
-
-    impl IterNext for PyTokenizerIter {
-        fn next(zelf: &Py<Self>, vm: &VirtualMachine) -> PyResult<PyIterReturn> {
-            let mut state = {
-                let guard = zelf.state.read();
-                guard.clone()
-            };
-
-            if state.eof {
-                return Ok(PyIterReturn::StopIteration(None));
-            }
-
-            let token = loop {
-                // TODO: Check here for errors. Raise SyntaxError if needed
-
-                if let Some(tok) = state.next_token() {
-                    break tok;
-                }
-
-                let nline = zelf.readline(vm)?;
-                if nline.is_empty() {
-                    state.eof = true;
-                    *zelf.state.write() = state.clone();
-
-                    let line_num = &state.start().0;
-                    let out = vm
-                        .ctx
-                        .new_tuple(vec![
-                            token_kind_value(TokenKind::EndOfFile).to_pyobject(vm),
-                            vm.ctx.new_str("").into(),
-                            vm.ctx
-                                .new_tuple(vec![line_num.to_pyobject(vm), (-1).to_pyobject(vm)])
-                                .into(),
-                            vm.ctx
-                                .new_tuple(vec![line_num.to_pyobject(vm), (-1).to_pyobject(vm)])
-                                .into(),
-                            vm.ctx.new_str(state.current_line()).into(),
-                        ])
-                        .into();
-                    return Ok(PyIterReturn::Return(out));
-                }
-                state.push_line(&nline);
-            };
-
-            *zelf.state.write() = state.clone();
-
-            let token_kind = token.kind();
-            let token_value = if zelf.extra_tokens && token_kind.is_operator() {
-                TOKEN_OP
-            } else {
-                token_kind_value(token_kind)
-            };
-            let (start_x, start_y) = &state.start();
-            let (end_x, end_y) = &state.end();
-
-            let mut token_repr = &state.source[state.range()];
-            if !zelf.extra_tokens {
-                token_repr = token_repr.trim();
-            }
-
-            let out = vm
-                .ctx
-                .new_tuple(vec![
-                    token_value.to_pyobject(vm),
-                    vm.ctx.new_str(token_repr).into(),
-                    vm.ctx
-                        .new_tuple(vec![start_x.to_pyobject(vm), start_y.to_pyobject(vm)])
-                        .into(),
-                    vm.ctx
-                        .new_tuple(vec![end_x.to_pyobject(vm), end_y.to_pyobject(vm)])
-                        .into(),
-                    vm.ctx.new_str(state.current_line()).into(),
-                ])
-                .into();
-            Ok(PyIterReturn::Return(out))
-        }
-    }
-
-    #[derive(FromArgs)]
-    pub struct PyTokenizerIterArgs {
-        #[pyarg(positional)]
-        readline: ArgCallable,
-        #[pyarg(named)]
-        extra_tokens: bool,
-        #[pyarg(named, optional)]
-        encoding: Option<rustpython_vm::PyRef<PyStr>>,
-    }
-
-    #[derive(Clone, Debug)]
-    struct PyTokenizerIterState {
-        /// Source code.
-        source: String,
-        prev_token: Option<Token>,
-        /// Tokens of `source`.
-        tokens: Tokens,
-        /// Errors of `source`
-        errors: Vec<ParseError>,
-        /// LineIndex of `source`.
-        line_index: LineIndex,
-        /// Marker that says we already emitted EOF, and needs to stop iterating.
-        eof: bool,
-    }
-
-    impl PyTokenizerIterState {
-        fn push_line(&mut self, line: &str) {
-            self.source.push_str(line);
-
-            let parsed = parse_unchecked_source(&self.source, PySourceType::Python);
-            self.tokens = parsed.tokens().clone();
-            self.errors = parsed.errors().to_vec();
-            self.line_index = LineIndex::from_source_text(&self.source);
-        }
-
-        #[must_use]
-        fn current_line(&self) -> &str {
-            let (kind, range): (TokenKind, TextRange) = match self.prev_token {
-                Some(token) => token.as_tuple(),
-                None => (TokenKind::Unknown, TextRange::default()),
-            };
-
-            match kind {
-                TokenKind::Newline => self.source.full_line_str(range.start()),
-                _ => self.source.full_lines_str(range),
-            }
-        }
-
-        #[must_use]
-        fn next_token(&mut self) -> Option<Token> {
-            for token in self.tokens.iter() {
-                let (kind, range): (TokenKind, TextRange) = token.as_tuple();
-
-                if matches!(kind, TokenKind::NonLogicalNewline) {
-                    continue;
-                }
-
-                if matches!(range.ordering(self.range()), Ordering::Greater) {
-                    self.prev_token = Some(*token);
-                    return self.prev_token;
-                }
-            }
-
-            None
-        }
-
-        #[must_use]
-        fn range(&self) -> TextRange {
-            match self.prev_token {
-                Some(token) => token.range(),
-                None => TextRange::default(),
-            }
-        }
-
-        #[must_use]
-        fn start(&self) -> (usize, usize) {
-            let lc = self
-                .line_index
-                .line_column(self.range().start(), &self.source);
-            (lc.line.get(), lc.column.to_zero_indexed())
-        }
-
-        #[must_use]
-        fn end(&self) -> (usize, usize) {
-            let lc = self
-                .line_index
-                .line_column(self.range().end(), &self.source);
-            (lc.line.get(), lc.column.to_zero_indexed())
-        }
-    }
-
-    impl Default for PyTokenizerIterState {
-        fn default() -> Self {
-            const SOURCE: &str = "";
-            let parsed = parse_unchecked_source(SOURCE, PySourceType::Python);
-
-            Self {
-                source: SOURCE.to_owned(),
-                prev_token: None,
-                tokens: parsed.tokens().clone(),
-                errors: parsed.errors().to_vec(),
-                line_index: LineIndex::from_source_text(SOURCE),
-                eof: false,
-            }
-        }
-    }
-
-    const fn token_kind_value(kind: TokenKind) -> u8 {
-        match kind {
-            TokenKind::EndOfFile => 0,
-            TokenKind::Name
-            | TokenKind::For
-            | TokenKind::In
-            | TokenKind::Pass
-            | TokenKind::Class
-            | TokenKind::And
-            | TokenKind::Is
-            | TokenKind::Raise
-            | TokenKind::True
-            | TokenKind::False
-            | TokenKind::Assert
-            | TokenKind::Try
-            | TokenKind::While
-            | TokenKind::Yield
-            | TokenKind::Lambda
-            | TokenKind::None
-            | TokenKind::Not
-            | TokenKind::Or
-            | TokenKind::Break
-            | TokenKind::Continue
-            | TokenKind::Global
-            | TokenKind::Nonlocal
-            | TokenKind::Return
-            | TokenKind::Except
-            | TokenKind::Import
-            | TokenKind::Case
-            | TokenKind::Match
-            | TokenKind::Type
-            | TokenKind::Await
-            | TokenKind::With
-            | TokenKind::Del
-            | TokenKind::Finally
-            | TokenKind::From
-            | TokenKind::Def
-            | TokenKind::If
-            | TokenKind::Else
-            | TokenKind::Elif
-            | TokenKind::As
-            | TokenKind::Async => 1,
-            TokenKind::Int | TokenKind::Complex | TokenKind::Float => 2,
-            TokenKind::String => 3,
-            TokenKind::Newline | TokenKind::NonLogicalNewline => 4,
-            TokenKind::Indent => 5,
-            TokenKind::Dedent => 6,
-            TokenKind::Lpar => 7,
-            TokenKind::Rpar => 8,
-            TokenKind::Lsqb => 9,
-            TokenKind::Rsqb => 10,
-            TokenKind::Colon => 11,
-            TokenKind::Comma => 12,
-            TokenKind::Semi => 13,
-            TokenKind::Plus => 14,
-            TokenKind::Minus => 15,
-            TokenKind::Star => 16,
-            TokenKind::Slash => 17,
-            TokenKind::Vbar => 18,
-            TokenKind::Amper => 19,
-            TokenKind::Less => 20,
-            TokenKind::Greater => 21,
-            TokenKind::Equal => 22,
-            TokenKind::Dot => 23,
-            TokenKind::Percent => 24,
-            TokenKind::Lbrace => 25,
-            TokenKind::Rbrace => 26,
-            TokenKind::EqEqual => 27,
-            TokenKind::NotEqual => 28,
-            TokenKind::LessEqual => 29,
-            TokenKind::GreaterEqual => 30,
-            TokenKind::Tilde => 31,
-            TokenKind::CircumFlex => 32,
-            TokenKind::LeftShift => 33,
-            TokenKind::RightShift => 34,
-            TokenKind::DoubleStar => 35,
-            TokenKind::PlusEqual => 36,
-            TokenKind::MinusEqual => 37,
-            TokenKind::StarEqual => 38,
-            TokenKind::SlashEqual => 39,
-            TokenKind::PercentEqual => 40,
-            TokenKind::AmperEqual => 41,
-            TokenKind::VbarEqual => 42,
-            TokenKind::CircumflexEqual => 43,
-            TokenKind::LeftShiftEqual => 44,
-            TokenKind::RightShiftEqual => 45,
-            TokenKind::DoubleStarEqual => 46,
-            TokenKind::DoubleSlash => 47,
-            TokenKind::DoubleSlashEqual => 48,
-            TokenKind::At => 49,
-            TokenKind::AtEqual => 50,
-            TokenKind::Rarrow => 51,
-            TokenKind::Ellipsis => 52,
-            TokenKind::ColonEqual => 53,
-            TokenKind::Exclamation => 54,
-            TokenKind::FStringStart => 59,
-            TokenKind::FStringMiddle => 60,
-            TokenKind::FStringEnd => 61,
-            TokenKind::Comment => 62,
-            TokenKind::TStringStart => 62,  // 3.14 compatible
-            TokenKind::TStringMiddle => 63, // 3.14 compatible
-            TokenKind::TStringEnd => 64,    // 3.14 compatible
-            TokenKind::IpyEscapeCommand | TokenKind::Question => 0, // Ruff's specific
-            TokenKind::Unknown => 0,
-        }
-    }
-}