no-misleading-character-class.js 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537
  1. /**
  2. * @author Toru Nagashima <https://github.com/mysticatea>
  3. */
  4. "use strict";
  5. const {
  6. CALL,
  7. CONSTRUCT,
  8. ReferenceTracker,
  9. getStaticValue,
  10. getStringIfConstant
  11. } = require("@eslint-community/eslint-utils");
  12. const { RegExpParser, visitRegExpAST } = require("@eslint-community/regexpp");
  13. const { isCombiningCharacter, isEmojiModifier, isRegionalIndicatorSymbol, isSurrogatePair } = require("./utils/unicode");
  14. const astUtils = require("./utils/ast-utils.js");
  15. const { isValidWithUnicodeFlag } = require("./utils/regular-expressions");
  16. const { parseStringLiteral, parseTemplateToken } = require("./utils/char-source");
  17. //------------------------------------------------------------------------------
  18. // Helpers
  19. //------------------------------------------------------------------------------
  20. /**
  21. * @typedef {import('@eslint-community/regexpp').AST.Character} Character
  22. * @typedef {import('@eslint-community/regexpp').AST.CharacterClassElement} CharacterClassElement
  23. */
  /**
   * Splits the elements of a character class into runs of adjacent single characters.
   *
   * A `CharacterClassRange` (e.g. `a-z`) takes a character from each side of the hyphen,
   * so its `min` is appended to the current run as its last character and its `max`
   * becomes the first character of the next run. Character sets, nested classes,
   * string disjunctions and class expressions end the current run without joining it.
   * @param {CharacterClassElement[]} nodes The elements of a character class.
   * @returns {IterableIterator<Character[]>} The runs of adjacent characters, in source order.
   */
  function *iterateCharacterSequence(nodes) {

      // Element types that are never part of a run and therefore terminate it.
      const runBreakers = new Set([
          "CharacterSet",
          "CharacterClass", // [[]] nested character class
          "ClassStringDisjunction", // \q{...}
          "ExpressionCharacterClass" // [A--B]
      ]);

      /** @type {Character[]} */
      let run = [];

      for (const element of nodes) {
          const { type } = element;

          if (type === "Character") {
              run.push(element);
          } else if (type === "CharacterClassRange") {

              // The range boundary closes the current run and opens a new one.
              run.push(element.min);
              yield run;
              run = [element.max];
          } else if (runBreakers.has(type) && run.length > 0) {
              yield run;
              run = [];
          }
      }

      // Flush whatever is left after the last element.
      if (run.length > 0) {
          yield run;
      }
  }
  /**
   * Tells whether a character node was written as a Unicode code point escape,
   * i.e. its raw text has the form `\u{...}` with one or more hexadecimal digits.
   * @param {Character} char The character node to inspect.
   * @returns {boolean} `true` if the raw text of the node is a `\u{...}` escape.
   */
  function isUnicodeCodePointEscape(char) {
      const { raw } = char;
      const codePointEscapePattern = /^\\u\{[\da-f]+\}$/iu;

      return codePointEscapePattern.test(raw);
  }
  /**
   * Yields every `[previous, current]` pair of neighboring characters for which both
   * members are present. `current` is taken from `chars`; `previous` is taken from
   * `previousChars` at the preceding index.
   * @param {Array<Character | null>} chars The characters supplying the second member of each pair.
   * @param {Array<Character | null>} [previousChars] The characters supplying the first member of each pair. Defaults to `chars`.
   * @returns {IterableIterator<Character[]>} The adjacent pairs.
   */
  function *iterateAdjacentPairs(chars, previousChars = chars) {
      for (const [index, char] of chars.entries()) {
          const previous = previousChars[index - 1];

          if (previous && char) {
              yield [previous, char];
          }
      }
  }

  /**
   * Detectors for each kind of misleading character sequence, keyed by the message ID
   * that is used to report the problem.
   * Each generator receives one run of characters and yields the groups of characters
   * that form a problem of its kind. Entries of `chars` may be `null` (the rule replaces
   * acceptable escape sequences with `null` when `allowEscape` is enabled); a `null`
   * entry never takes part in a match.
   * @type {Record<string, (chars: Array<Character | null>, unfilteredChars?: Character[]) => IterableIterator<Character[]>>}
   */
  const findCharacterSequences = {

      // A surrogate pair where neither half is written as a `\u{...}` escape.
      *surrogatePairWithoutUFlag(chars) {
          for (const pair of iterateAdjacentPairs(chars)) {
              const [previous, char] = pair;

              if (
                  isSurrogatePair(previous.value, char.value) &&
                  !isUnicodeCodePointEscape(previous) &&
                  !isUnicodeCodePointEscape(char)
              ) {
                  yield pair;
              }
          }
      },

      // A surrogate pair where at least one half is written as a `\u{...}` escape.
      *surrogatePair(chars) {
          for (const pair of iterateAdjacentPairs(chars)) {
              const [previous, char] = pair;

              if (
                  isSurrogatePair(previous.value, char.value) &&
                  (
                      isUnicodeCodePointEscape(previous) ||
                      isUnicodeCodePointEscape(char)
                  )
              ) {
                  yield pair;
              }
          }
      },

      /*
       * A non-combining character followed by a combining character.
       *
       * When `allowEscape` is `true`, a combined character should only be allowed if the
       * combining mark appears as an escape sequence. This means that the base character
       * should be considered even if it's escaped, which is why the base is looked up in
       * `unfilteredChars`. `unfilteredChars` defaults to `chars` so that the detector can
       * also be called with a single argument, like all the other detectors.
       */
      *combiningClass(chars, unfilteredChars = chars) {
          for (const pair of iterateAdjacentPairs(chars, unfilteredChars)) {
              const [previous, char] = pair;

              if (
                  isCombiningCharacter(char.value) &&
                  !isCombiningCharacter(previous.value)
              ) {
                  yield pair;
              }
          }
      },

      // A character that is not an emoji modifier followed by an emoji modifier.
      *emojiModifier(chars) {
          for (const pair of iterateAdjacentPairs(chars)) {
              const [previous, char] = pair;

              if (
                  isEmojiModifier(char.value) &&
                  !isEmojiModifier(previous.value)
              ) {
                  yield pair;
              }
          }
      },

      // Two consecutive regional indicator symbols.
      *regionalIndicatorSymbol(chars) {
          for (const pair of iterateAdjacentPairs(chars)) {
              const [previous, char] = pair;

              if (
                  isRegionalIndicatorSymbol(char.value) &&
                  isRegionalIndicatorSymbol(previous.value)
              ) {
                  yield pair;
              }
          }
      },

      /*
       * Characters glued together by U+200D (zero width joiner).
       * Chained joins such as `A ZWJ B ZWJ C` are merged into one sequence.
       */
      *zwj(chars) {
          let sequence = null;

          for (const [index, char] of chars.entries()) {
              const previous = chars[index - 1];
              const next = chars[index + 1];

              if (
                  previous && char && next &&
                  char.value === 0x200d &&
                  previous.value !== 0x200d &&
                  next.value !== 0x200d
              ) {
                  if (sequence && sequence.at(-1) === previous) {
                      sequence.push(char, next); // append to the sequence
                  } else {
                      if (sequence) {
                          yield sequence;
                      }
                      sequence = chars.slice(index - 1, index + 2);
                  }
              }
          }

          if (sequence) {
              yield sequence;
          }
      }
  };

  // The detector names, in the order in which the detectors are applied.
  const kinds = Object.keys(findCharacterSequences);
  /**
   * Resolves a node either to the `regex` descriptor of a regular expression literal or to
   * a static value that is not a regular expression object.
   *
   * This is a replacement for `getStaticValue` for environments where certain regular
   * expressions cannot be evaluated. A known example is Node.js 18, which does not support
   * the `v` flag: calling `getStaticValue` on a regular expression node with the `v` flag
   * always returns `null` there. Reading the `regex` property of the literal avoids
   * evaluating the expression at all.
   *
   * Limitation: a regular expression is only recognized if the given node is itself a
   * regular expression literal. A node that merely evaluates to a `RegExp` object yields `null`.
   * @param {ASTNode | undefined} node The node to be inspected.
   * @param {Scope} initialScope Scope to start finding variables. Identifier references in this scope are resolved.
   * @returns {{ value: any } | { regex: { pattern: string, flags: string } } | null} The static value of the node, or `null`.
   */
  function getStaticValueOrRegex(node, initialScope) {
      if (!node) {
          return null;
      }

      const isRegexLiteral = node.type === "Literal" && node.regex;

      if (isRegexLiteral) {
          return { regex: node.regex };
      }

      const evaluated = getStaticValue(node, initialScope);
      const isRegExpObject = evaluated?.value instanceof RegExp;

      // A RegExp object reached indirectly (e.g. through a variable) is deliberately not reported.
      return isRegExpObject ? null : evaluated;
  }
  /**
   * Decides whether a regexpp character is written as an acceptable escape sequence,
   * given the source text that produces it.
   *
   * The source must start with a backslash. It is rejected only when it consists of
   * backslashes followed by a single character that is the escaped character itself
   * (e.g. `\` followed by the literal character); any other backslash sequence is accepted.
   * @param {Character} char Character to check.
   * @param {string} charSource Source text of the character to check.
   * @returns {boolean} Whether the character is represented as an acceptable escape sequence.
   */
  function checkForAcceptableEscape(char, charSource) {
      if (charSource.startsWith("\\")) {

          // Matches the one character that follows a leading run of backslashes, if that is all there is.
          const trailing = /(?<=^\\+).$/su.exec(charSource);
          const escapedSelf = String.fromCodePoint(char.value);

          return trailing?.[0] !== escapedSelf;
      }

      return false;
  }
  /**
   * Decides whether a regexpp character is written as an acceptable escape sequence when
   * the pattern comes from a string or template literal.
   *
   * The `start` and `end` of the character are used as indices into `codeUnits`; the
   * entries found there give the span of literal source text that produces the character.
   * That text is then checked with `checkForAcceptableEscape`.
   * @param {Character} char Character to check.
   * @param {string} nodeSource Source text of the string or template literal that produces the character.
   * @param {CodeUnit[]} codeUnits List of CodeUnit objects of the literal that produces the character.
   * @returns {boolean} Whether the character is represented as an acceptable escape sequence.
   */
  function checkForAcceptableEscapeInString(char, nodeSource, codeUnits) {
      const { start: firstUnitIndex, end: unitIndexAfterLast } = char;
      const { start: sourceStart } = codeUnits[firstUnitIndex];
      const { end: sourceEnd } = codeUnits[unitIndexAfterLast - 1];

      return checkForAcceptableEscape(char, nodeSource.slice(sourceStart, sourceEnd));
  }
  226. //------------------------------------------------------------------------------
  227. // Rule Definition
  228. //------------------------------------------------------------------------------
  229. /** @type {import('../shared/types').Rule} */
  230. module.exports = {
  231. meta: {
  232. type: "problem",
  233. docs: {
  234. description: "Disallow characters which are made with multiple code points in character class syntax",
  235. recommended: true,
  236. url: "https://eslint.org/docs/latest/rules/no-misleading-character-class"
  237. },
  238. hasSuggestions: true,
  239. schema: [
  240. {
  241. type: "object",
  242. properties: {
  243. allowEscape: {
  244. type: "boolean",
  245. default: false
  246. }
  247. },
  248. additionalProperties: false
  249. }
  250. ],
  251. messages: {
  252. surrogatePairWithoutUFlag: "Unexpected surrogate pair in character class. Use 'u' flag.",
  253. surrogatePair: "Unexpected surrogate pair in character class.",
  254. combiningClass: "Unexpected combined character in character class.",
  255. emojiModifier: "Unexpected modified Emoji in character class.",
  256. regionalIndicatorSymbol: "Unexpected national flag in character class.",
  257. zwj: "Unexpected joined character sequence in character class.",
  258. suggestUnicodeFlag: "Add unicode 'u' flag to regex."
  259. }
  260. },
  261. create(context) {
  262. const allowEscape = context.options[0]?.allowEscape;
  263. const sourceCode = context.sourceCode;
  264. const parser = new RegExpParser();
  265. const checkedPatternNodes = new Set();
  266. /**
  267. * Verify a given regular expression.
  268. * @param {Node} node The node to report.
  269. * @param {string} pattern The regular expression pattern to verify.
  270. * @param {string} flags The flags of the regular expression.
  271. * @param {Function} unicodeFixer Fixer for missing "u" flag.
  272. * @returns {void}
  273. */
  274. function verify(node, pattern, flags, unicodeFixer) {
  275. let patternNode;
  276. try {
  277. patternNode = parser.parsePattern(
  278. pattern,
  279. 0,
  280. pattern.length,
  281. {
  282. unicode: flags.includes("u"),
  283. unicodeSets: flags.includes("v")
  284. }
  285. );
  286. } catch {
  287. // Ignore regular expressions with syntax errors
  288. return;
  289. }
  290. let codeUnits = null;
  291. /**
  292. * Checks whether a specified regexpp character is represented as an acceptable escape sequence.
  293. * For the purposes of this rule, an escape sequence is considered acceptable if it consists of one or more backslashes followed by the character being escaped.
  294. * @param {Character} char Character to check.
  295. * @returns {boolean} Whether the specified regexpp character is represented as an acceptable escape sequence.
  296. */
  297. function isAcceptableEscapeSequence(char) {
  298. if (node.type === "Literal" && node.regex) {
  299. return checkForAcceptableEscape(char, char.raw);
  300. }
  301. if (node.type === "Literal" && typeof node.value === "string") {
  302. const nodeSource = node.raw;
  303. codeUnits ??= parseStringLiteral(nodeSource);
  304. return checkForAcceptableEscapeInString(char, nodeSource, codeUnits);
  305. }
  306. if (astUtils.isStaticTemplateLiteral(node)) {
  307. const nodeSource = sourceCode.getText(node);
  308. codeUnits ??= parseTemplateToken(nodeSource);
  309. return checkForAcceptableEscapeInString(char, nodeSource, codeUnits);
  310. }
  311. return false;
  312. }
  313. const foundKindMatches = new Map();
  314. visitRegExpAST(patternNode, {
  315. onCharacterClassEnter(ccNode) {
  316. for (const unfilteredChars of iterateCharacterSequence(ccNode.elements)) {
  317. let chars;
  318. if (allowEscape) {
  319. // Replace escape sequences with null to avoid having them flagged.
  320. chars = unfilteredChars.map(char => (isAcceptableEscapeSequence(char) ? null : char));
  321. } else {
  322. chars = unfilteredChars;
  323. }
  324. for (const kind of kinds) {
  325. const matches = findCharacterSequences[kind](chars, unfilteredChars);
  326. if (foundKindMatches.has(kind)) {
  327. foundKindMatches.get(kind).push(...matches);
  328. } else {
  329. foundKindMatches.set(kind, [...matches]);
  330. }
  331. }
  332. }
  333. }
  334. });
  335. /**
  336. * Finds the report loc(s) for a range of matches.
  337. * Only literals and expression-less templates generate granular errors.
  338. * @param {Character[][]} matches Lists of individual characters being reported on.
  339. * @returns {Location[]} locs for context.report.
  340. * @see https://github.com/eslint/eslint/pull/17515
  341. */
  342. function getNodeReportLocations(matches) {
  343. if (!astUtils.isStaticTemplateLiteral(node) && node.type !== "Literal") {
  344. return matches.length ? [node.loc] : [];
  345. }
  346. return matches.map(chars => {
  347. const firstIndex = chars[0].start;
  348. const lastIndex = chars.at(-1).end - 1;
  349. let start;
  350. let end;
  351. if (node.type === "TemplateLiteral") {
  352. const source = sourceCode.getText(node);
  353. const offset = node.range[0];
  354. codeUnits ??= parseTemplateToken(source);
  355. start = offset + codeUnits[firstIndex].start;
  356. end = offset + codeUnits[lastIndex].end;
  357. } else if (typeof node.value === "string") { // String Literal
  358. const source = node.raw;
  359. const offset = node.range[0];
  360. codeUnits ??= parseStringLiteral(source);
  361. start = offset + codeUnits[firstIndex].start;
  362. end = offset + codeUnits[lastIndex].end;
  363. } else { // RegExp Literal
  364. const offset = node.range[0] + 1; // Add 1 to skip the leading slash.
  365. start = offset + firstIndex;
  366. end = offset + lastIndex + 1;
  367. }
  368. return {
  369. start: sourceCode.getLocFromIndex(start),
  370. end: sourceCode.getLocFromIndex(end)
  371. };
  372. });
  373. }
  374. for (const [kind, matches] of foundKindMatches) {
  375. let suggest;
  376. if (kind === "surrogatePairWithoutUFlag") {
  377. suggest = [{
  378. messageId: "suggestUnicodeFlag",
  379. fix: unicodeFixer
  380. }];
  381. }
  382. const locs = getNodeReportLocations(matches);
  383. for (const loc of locs) {
  384. context.report({
  385. node,
  386. loc,
  387. messageId: kind,
  388. suggest
  389. });
  390. }
  391. }
  392. }
  393. return {
  394. "Literal[regex]"(node) {
  395. if (checkedPatternNodes.has(node)) {
  396. return;
  397. }
  398. verify(node, node.regex.pattern, node.regex.flags, fixer => {
  399. if (!isValidWithUnicodeFlag(context.languageOptions.ecmaVersion, node.regex.pattern)) {
  400. return null;
  401. }
  402. return fixer.insertTextAfter(node, "u");
  403. });
  404. },
  405. "Program"(node) {
  406. const scope = sourceCode.getScope(node);
  407. const tracker = new ReferenceTracker(scope);
  408. /*
  409. * Iterate calls of RegExp.
  410. * E.g., `new RegExp()`, `RegExp()`, `new window.RegExp()`,
  411. * `const {RegExp: a} = window; new a()`, etc...
  412. */
  413. for (const { node: refNode } of tracker.iterateGlobalReferences({
  414. RegExp: { [CALL]: true, [CONSTRUCT]: true }
  415. })) {
  416. let pattern, flags;
  417. const [patternNode, flagsNode] = refNode.arguments;
  418. const evaluatedPattern = getStaticValueOrRegex(patternNode, scope);
  419. if (!evaluatedPattern) {
  420. continue;
  421. }
  422. if (flagsNode) {
  423. if (evaluatedPattern.regex) {
  424. pattern = evaluatedPattern.regex.pattern;
  425. checkedPatternNodes.add(patternNode);
  426. } else {
  427. pattern = String(evaluatedPattern.value);
  428. }
  429. flags = getStringIfConstant(flagsNode, scope);
  430. } else {
  431. if (evaluatedPattern.regex) {
  432. continue;
  433. }
  434. pattern = String(evaluatedPattern.value);
  435. flags = "";
  436. }
  437. if (typeof flags === "string") {
  438. verify(patternNode, pattern, flags, fixer => {
  439. if (!isValidWithUnicodeFlag(context.languageOptions.ecmaVersion, pattern)) {
  440. return null;
  441. }
  442. if (refNode.arguments.length === 1) {
  443. const penultimateToken = sourceCode.getLastToken(refNode, { skip: 1 }); // skip closing parenthesis
  444. return fixer.insertTextAfter(
  445. penultimateToken,
  446. astUtils.isCommaToken(penultimateToken)
  447. ? ' "u",'
  448. : ', "u"'
  449. );
  450. }
  451. if ((flagsNode.type === "Literal" && typeof flagsNode.value === "string") || flagsNode.type === "TemplateLiteral") {
  452. const range = [flagsNode.range[0], flagsNode.range[1] - 1];
  453. return fixer.insertTextAfterRange(range, "u");
  454. }
  455. return null;
  456. });
  457. }
  458. }
  459. }
  460. };
  461. }
  462. };