From 9781563ef636bcc76483a329b983cb062d1371c6 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Mon, 5 Feb 2024 08:00:18 -0800 Subject: [PATCH] Add fast-path for comment detection (#9808) ## Summary When we fall through to parsing, the comment-detection rule is a significant portion of lint time. This PR adds an additional fast heuristic whereby we abort if a comment contains two consecutive name tokens (via the zero-allocation lexer). For the `ctypeslib.py`, which has a few cases that are now caught by this, it's a 2.5x speedup for the rule (and a 20% speedup for token-based rules). --- .../src/rules/eradicate/detection.rs | 17 +++- ...__identifier_ending_in_non_start_char.snap | 2 +- ..._identifier_starting_with_string_kind.snap | 18 +++++ ...kenizer__tests__string_with_byte_kind.snap | 14 ++++ ...izer__tests__string_with_invalid_kind.snap | 18 +++++ ...a__tokenizer__tests__string_with_kind.snap | 14 ++++ ...via__tokenizer__tests__tricky_unicode.snap | 2 +- crates/ruff_python_trivia/src/tokenizer.rs | 80 ++++++++++++++++++- 8 files changed, 157 insertions(+), 8 deletions(-) create mode 100644 crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__identifier_starting_with_string_kind.snap create mode 100644 crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_byte_kind.snap create mode 100644 crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_invalid_kind.snap create mode 100644 crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_kind.snap diff --git a/crates/ruff_linter/src/rules/eradicate/detection.rs b/crates/ruff_linter/src/rules/eradicate/detection.rs index 95e77fb181d9cc..c9e3b06a99209c 100644 --- a/crates/ruff_linter/src/rules/eradicate/detection.rs +++ b/crates/ruff_linter/src/rules/eradicate/detection.rs @@ -1,14 +1,16 @@ /// See: 
[eradicate.py](https://github.com/myint/eradicate/blob/98f199940979c94447a461d50d27862b118b282d/eradicate.py) use aho_corasick::AhoCorasick; +use itertools::Itertools; use once_cell::sync::Lazy; use regex::{Regex, RegexSet}; use ruff_python_parser::parse_suite; +use ruff_python_trivia::{SimpleTokenKind, SimpleTokenizer}; +use ruff_text_size::TextSize; static CODE_INDICATORS: Lazy<AhoCorasick> = Lazy::new(|| { AhoCorasick::new([ - "(", ")", "[", "]", "{", "}", ":", "=", "%", "print", "return", "break", "continue", - "import", + "(", ")", "[", "]", "{", "}", ":", "=", "%", "return", "break", "continue", "import", ]) .unwrap() }); @@ -44,6 +46,14 @@ pub(crate) fn comment_contains_code(line: &str, task_tags: &[String]) -> bool { return false; } + // Fast path: if the comment contains consecutive identifiers, we know it won't parse. + let tokenizer = SimpleTokenizer::starts_at(TextSize::default(), line).skip_trivia(); + if tokenizer.tuple_windows().any(|(first, second)| { + first.kind == SimpleTokenKind::Name && second.kind == SimpleTokenKind::Name + }) { + return false; + } + // Ignore task tag comments (e.g., "# TODO(tom): Refactor").
if line .split(&[' ', ':', '(']) @@ -123,9 +133,10 @@ mod tests { #[test] fn comment_contains_code_with_print() { - assert!(comment_contains_code("#print", &[])); assert!(comment_contains_code("#print(1)", &[])); + assert!(!comment_contains_code("#print", &[])); + assert!(!comment_contains_code("#print 1", &[])); assert!(!comment_contains_code("#to print", &[])); } diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__identifier_ending_in_non_start_char.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__identifier_ending_in_non_start_char.snap index e5fd7c7c4017ca..7b3494de286b63 100644 --- a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__identifier_ending_in_non_start_char.snap +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__identifier_ending_in_non_start_char.snap @@ -4,7 +4,7 @@ expression: test_case.tokens() --- [ SimpleToken { - kind: Other, + kind: Name, range: 0..2, }, ] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__identifier_starting_with_string_kind.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__identifier_starting_with_string_kind.snap new file mode 100644 index 00000000000000..6d8bfb8f06ad65 --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__identifier_starting_with_string_kind.snap @@ -0,0 +1,18 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokens() +--- +[ + SimpleToken { + kind: Name, + range: 0..3, + }, + SimpleToken { + kind: Whitespace, + range: 3..4, + }, + SimpleToken { + kind: Name, + range: 4..7, + }, +] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_byte_kind.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_byte_kind.snap new file mode 
100644 index 00000000000000..5b535116cbb3fc --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_byte_kind.snap @@ -0,0 +1,14 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokens() +--- +[ + SimpleToken { + kind: Other, + range: 0..2, + }, + SimpleToken { + kind: Bogus, + range: 2..7, + }, +] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_invalid_kind.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_invalid_kind.snap new file mode 100644 index 00000000000000..cab8e69c89f4af --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_invalid_kind.snap @@ -0,0 +1,18 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokens() +--- +[ + SimpleToken { + kind: Name, + range: 0..3, + }, + SimpleToken { + kind: Other, + range: 3..4, + }, + SimpleToken { + kind: Bogus, + range: 4..8, + }, +] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_kind.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_kind.snap new file mode 100644 index 00000000000000..a3030fc9af35b7 --- /dev/null +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__string_with_kind.snap @@ -0,0 +1,14 @@ +--- +source: crates/ruff_python_trivia/src/tokenizer.rs +expression: test_case.tokens() +--- +[ + SimpleToken { + kind: Other, + range: 0..1, + }, + SimpleToken { + kind: Bogus, + range: 1..6, + }, +] diff --git a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tricky_unicode.snap b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tricky_unicode.snap index 37678a05d39ede..81864953b5015f 100644 --- 
a/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tricky_unicode.snap +++ b/crates/ruff_python_trivia/src/snapshots/ruff_python_trivia__tokenizer__tests__tricky_unicode.snap @@ -4,7 +4,7 @@ expression: test_case.tokens() --- [ SimpleToken { - kind: Other, + kind: Name, range: 0..6, }, ] diff --git a/crates/ruff_python_trivia/src/tokenizer.rs b/crates/ruff_python_trivia/src/tokenizer.rs index dad44dd51ae002..d1135bb9bca4f7 100644 --- a/crates/ruff_python_trivia/src/tokenizer.rs +++ b/crates/ruff_python_trivia/src/tokenizer.rs @@ -182,7 +182,7 @@ fn to_keyword_or_other(source: &str) -> SimpleTokenKind { "case" => SimpleTokenKind::Case, "with" => SimpleTokenKind::With, "yield" => SimpleTokenKind::Yield, - _ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals + _ => SimpleTokenKind::Name, // Potentially an identifier, but only if it isn't a string prefix. The caller (SimpleTokenizer) is responsible for enforcing that constraint. } } @@ -467,6 +467,9 @@ pub enum SimpleTokenKind { /// `yield` Yield, + /// An identifier or keyword. + Name, + /// Any other non trivia token. Other, @@ -566,10 +569,42 @@ impl<'a> SimpleTokenizer<'a> { let range = TextRange::at(self.offset, token_len); let kind = to_keyword_or_other(&self.source[range]); - if kind == SimpleTokenKind::Other { + // If the next character is a quote, we may be in a string prefix. For example: + // `f"foo`. + if kind == SimpleTokenKind::Name + && matches!(self.cursor.first(), '"' | '\'') + && matches!( + &self.source[range], + "B" | "BR" + | "Br" + | "F" + | "FR" + | "Fr" + | "R" + | "RB" + | "RF" + | "Rb" + | "Rf" + | "U" + | "b" + | "bR" + | "br" + | "f" + | "fR" + | "fr" + | "r" + | "rB" + | "rF" + | "rb" + | "rf" + | "u" + ) + { self.bogus = true; + SimpleTokenKind::Other + } else { + kind } - kind } // Space, tab, or form feed. 
We ignore the true semantics of form feed, and treat it as @@ -1153,6 +1188,45 @@ mod tests { test_case.assert_reverse_tokenization(); } + #[test] + fn string_with_kind() { + let source = "f'foo'"; + + let test_case = tokenize(source); + assert_debug_snapshot!(test_case.tokens()); + + // note: not reversible: [other, bogus] vs [bogus, other] + } + + #[test] + fn string_with_byte_kind() { + let source = "BR'foo'"; + + let test_case = tokenize(source); + assert_debug_snapshot!(test_case.tokens()); + + // note: not reversible: [other, bogus] vs [bogus, other] + } + + #[test] + fn string_with_invalid_kind() { + let source = "abc'foo'"; + + let test_case = tokenize(source); + assert_debug_snapshot!(test_case.tokens()); + + // note: not reversible: [other, bogus] vs [bogus, other] + } + + #[test] + fn identifier_starting_with_string_kind() { + let source = "foo bar"; + + let test_case = tokenize(source); + assert_debug_snapshot!(test_case.tokens()); + test_case.assert_reverse_tokenization(); + } + #[test] fn ignore_word_with_only_id_continuing_chars() { let source = "555";