From 698f8c2f01ea549d77d7dc3338a12e04c11057b9 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 17 Apr 2024 14:02:58 +0200 Subject: Adding upstream version 1.64.0+dfsg1. Signed-off-by: Daniel Baumann --- compiler/rustc_lint/src/non_ascii_idents.rs | 345 ++++++++++++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 compiler/rustc_lint/src/non_ascii_idents.rs (limited to 'compiler/rustc_lint/src/non_ascii_idents.rs') diff --git a/compiler/rustc_lint/src/non_ascii_idents.rs b/compiler/rustc_lint/src/non_ascii_idents.rs new file mode 100644 index 000000000..764003e61 --- /dev/null +++ b/compiler/rustc_lint/src/non_ascii_idents.rs @@ -0,0 +1,345 @@ +use crate::{EarlyContext, EarlyLintPass, LintContext}; +use rustc_ast as ast; +use rustc_data_structures::fx::FxHashMap; +use rustc_errors::fluent; +use rustc_span::symbol::Symbol; + +declare_lint! { + /// The `non_ascii_idents` lint detects non-ASCII identifiers. + /// + /// ### Example + /// + /// ```rust,compile_fail + /// # #![allow(unused)] + /// #![deny(non_ascii_idents)] + /// fn main() { + /// let föö = 1; + /// } + /// ``` + /// + /// {{produces}} + /// + /// ### Explanation + /// + /// This lint allows projects that wish to retain the limit of only using + /// ASCII characters to switch this lint to "forbid" (for example to ease + /// collaboration or for security reasons). + /// See [RFC 2457] for more details. + /// + /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md + pub NON_ASCII_IDENTS, + Allow, + "detects non-ASCII identifiers", + crate_level_only +} + +declare_lint! { + /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in + /// identifiers. + /// + /// ### Example + /// + /// ```rust + /// # #![allow(unused)] + /// const µ: f64 = 0.000001; + /// ``` + /// + /// {{produces}} + /// + /// ### Explanation + /// + /// This lint warns about using characters which are not commonly used, and may + /// cause visual confusion. + /// + /// This lint is triggered by identifiers that contain a codepoint that is + /// not part of the set of "Allowed" codepoints as described by [Unicode® + /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General + /// Security Profile for Identifiers][TR39Allowed]. + /// + /// Note that the set of uncommon codepoints may change over time. Beware + /// that if you "forbid" this lint that existing code may fail in the + /// future. + /// + /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile + pub UNCOMMON_CODEPOINTS, + Warn, + "detects uncommon Unicode codepoints in identifiers", + crate_level_only +} + +declare_lint! { + /// The `confusable_idents` lint detects visually confusable pairs between + /// identifiers. + /// + /// ### Example + /// + /// ```rust + /// // Latin Capital Letter E With Caron + /// pub const Ě: i32 = 1; + /// // Latin Capital Letter E With Breve + /// pub const Ĕ: i32 = 2; + /// ``` + /// + /// {{produces}} + /// + /// ### Explanation + /// + /// This lint warns when different identifiers may appear visually similar, + /// which can cause confusion. + /// + /// The confusable detection algorithm is based on [Unicode® Technical + /// Standard #39 Unicode Security Mechanisms Section 4 Confusable + /// Detection][TR39Confusable]. For every distinct identifier X execute + /// the function `skeleton(X)`. If there exist two distinct identifiers X + /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it. + /// The compiler uses the same mechanism to check if an identifier is too + /// similar to a keyword. + /// + /// Note that the set of confusable characters may change over time. + /// Beware that if you "forbid" this lint that existing code may fail in + /// the future. + /// + /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection + pub CONFUSABLE_IDENTS, + Warn, + "detects visually confusable pairs between identifiers", + crate_level_only +} + +declare_lint! { + /// The `mixed_script_confusables` lint detects visually confusable + /// characters in identifiers between different [scripts]. + /// + /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode) + /// + /// ### Example + /// + /// ```rust + /// // The Japanese katakana character エ can be confused with the Han character 工. + /// const エ: &'static str = "アイウ"; + /// ``` + /// + /// {{produces}} + /// + /// ### Explanation + /// + /// This lint warns when characters between different scripts may appear + /// visually similar, which can cause confusion. + /// + /// If the crate contains other identifiers in the same script that have + /// non-confusable characters, then this lint will *not* be issued. For + /// example, if the example given above has another identifier with + /// katakana characters (such as `let カタカナ = 123;`), then this indicates + /// that you are intentionally using katakana, and it will not warn about + /// it. + /// + /// Note that the set of confusable characters may change over time. + /// Beware that if you "forbid" this lint that existing code may fail in + /// the future. + pub MIXED_SCRIPT_CONFUSABLES, + Warn, + "detects Unicode scripts whose mixed script confusables codepoints are solely used", + crate_level_only +} + +declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]); + +impl EarlyLintPass for NonAsciiIdents { + fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) { + use rustc_session::lint::Level; + use rustc_span::Span; + use std::collections::BTreeMap; + use unicode_security::GeneralSecurityProfile; + + let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow; + let check_uncommon_codepoints = + cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow; + let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow; + let check_mixed_script_confusables = + cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow; + + if !check_non_ascii_idents + && !check_uncommon_codepoints + && !check_confusable_idents + && !check_mixed_script_confusables + { + return; + } + + let mut has_non_ascii_idents = false; + let symbols = cx.sess().parse_sess.symbol_gallery.symbols.lock(); + + // Sort by `Span` so that error messages make sense with respect to the + // order of identifier locations in the code. + let mut symbols: Vec<_> = symbols.iter().collect(); + symbols.sort_by_key(|k| k.1); + + for (symbol, &sp) in symbols.iter() { + let symbol_str = symbol.as_str(); + if symbol_str.is_ascii() { + continue; + } + has_non_ascii_idents = true; + cx.struct_span_lint(NON_ASCII_IDENTS, sp, |lint| { + lint.build(fluent::lint::identifier_non_ascii_char).emit(); + }); + if check_uncommon_codepoints + && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed) + { + cx.struct_span_lint(UNCOMMON_CODEPOINTS, sp, |lint| { + lint.build(fluent::lint::identifier_uncommon_codepoints).emit(); + }) + } + } + + if has_non_ascii_idents && check_confusable_idents { + let mut skeleton_map: FxHashMap = + FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default()); + let mut skeleton_buf = String::new(); + + for (&symbol, &sp) in symbols.iter() { + use unicode_security::confusable_detection::skeleton; + + let symbol_str = symbol.as_str(); + let is_ascii = symbol_str.is_ascii(); + + // Get the skeleton as a `Symbol`. + skeleton_buf.clear(); + skeleton_buf.extend(skeleton(&symbol_str)); + let skeleton_sym = if *symbol_str == *skeleton_buf { + symbol + } else { + Symbol::intern(&skeleton_buf) + }; + + skeleton_map + .entry(skeleton_sym) + .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| { + if !*existing_is_ascii || !is_ascii { + cx.struct_span_lint(CONFUSABLE_IDENTS, sp, |lint| { + lint.build(fluent::lint::confusable_identifier_pair) + .set_arg("existing_sym", *existing_symbol) + .set_arg("sym", symbol) + .span_label(*existing_span, fluent::lint::label) + .emit(); + }); + } + if *existing_is_ascii && !is_ascii { + *existing_symbol = symbol; + *existing_span = sp; + *existing_is_ascii = is_ascii; + } + }) + .or_insert((symbol, sp, is_ascii)); + } + } + + if has_non_ascii_idents && check_mixed_script_confusables { + use unicode_security::is_potential_mixed_script_confusable_char; + use unicode_security::mixed_script::AugmentedScriptSet; + + #[derive(Clone)] + enum ScriptSetUsage { + Suspicious(Vec, Span), + Verified, + } + + let mut script_states: FxHashMap = + FxHashMap::default(); + let latin_augmented_script_set = AugmentedScriptSet::for_char('A'); + script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified); + + let mut has_suspicous = false; + for (symbol, &sp) in symbols.iter() { + let symbol_str = symbol.as_str(); + for ch in symbol_str.chars() { + if ch.is_ascii() { + // all ascii characters are covered by exception. + continue; + } + if !GeneralSecurityProfile::identifier_allowed(ch) { + // this character is covered by `uncommon_codepoints` lint. + continue; + } + let augmented_script_set = AugmentedScriptSet::for_char(ch); + script_states + .entry(augmented_script_set) + .and_modify(|existing_state| { + if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state { + if is_potential_mixed_script_confusable_char(ch) { + ch_list.push(ch); + } else { + *existing_state = ScriptSetUsage::Verified; + } + } + }) + .or_insert_with(|| { + if !is_potential_mixed_script_confusable_char(ch) { + ScriptSetUsage::Verified + } else { + has_suspicous = true; + ScriptSetUsage::Suspicious(vec![ch], sp) + } + }); + } + } + + if has_suspicous { + let verified_augmented_script_sets = script_states + .iter() + .flat_map(|(k, v)| match v { + ScriptSetUsage::Verified => Some(*k), + _ => None, + }) + .collect::>(); + + // we're sorting the output here. + let mut lint_reports: BTreeMap<(Span, Vec), AugmentedScriptSet> = + BTreeMap::new(); + + 'outerloop: for (augment_script_set, usage) in script_states { + let ScriptSetUsage::Suspicious(mut ch_list, sp) = usage else { continue }; + + if augment_script_set.is_all() { + continue; + } + + for existing in verified_augmented_script_sets.iter() { + if existing.is_all() { + continue; + } + let mut intersect = *existing; + intersect.intersect_with(augment_script_set); + if !intersect.is_empty() && !intersect.is_all() { + continue 'outerloop; + } + } + + // We sort primitive chars here and can use unstable sort + ch_list.sort_unstable(); + ch_list.dedup(); + lint_reports.insert((sp, ch_list), augment_script_set); + } + + for ((sp, ch_list), script_set) in lint_reports { + cx.struct_span_lint(MIXED_SCRIPT_CONFUSABLES, sp, |lint| { + let mut includes = String::new(); + for (idx, ch) in ch_list.into_iter().enumerate() { + if idx != 0 { + includes += ", "; + } + let char_info = format!("'{}' (U+{:04X})", ch, ch as u32); + includes += &char_info; + } + lint.build(fluent::lint::mixed_script_confusables) + .set_arg("set", script_set.to_string()) + .set_arg("includes", includes) + .note(fluent::lint::includes_note) + .note(fluent::lint::note) + .emit(); + }); + } + } + } + } +} -- cgit v1.2.3