  // rewrite-pattern.js (27 KB)
  1. 'use strict';
  2. const generate = require('regjsgen').generate;
  3. const parse = require('regjsparser').parse;
  4. const regenerate = require('regenerate');
  5. const unicodeMatchProperty = require('unicode-match-property-ecmascript');
  6. const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
  7. const iuMappings = require('./data/iu-mappings.js');
  8. const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
  9. function flatMap(array, callback) {
  10. const result = [];
  11. array.forEach(item => {
  12. const res = callback(item);
  13. if (Array.isArray(res)) {
  14. result.push.apply(result, res);
  15. } else {
  16. result.push(res);
  17. }
  18. });
  19. return result;
  20. }
  21. function regenerateContainsAstral(regenerateData) {
  22. const data = regenerateData.data;
  23. return data.length >= 1 && data[data.length - 1] >= 0x10000;
  24. }
  25. const SPECIAL_CHARS = /([\\^$.*+?()[\]{}|])/g;
  26. // Prepare a Regenerate set containing all code points, used for negative
  27. // character classes (if any).
  28. const UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);
  29. const ASTRAL_SET = regenerate().addRange(0x10000, 0x10FFFF);
  30. const NEWLINE_SET = regenerate().add(
  31. // `LineTerminator`s (https://mths.be/es6#sec-line-terminators):
  32. 0x000A, // Line Feed <LF>
  33. 0x000D, // Carriage Return <CR>
  34. 0x2028, // Line Separator <LS>
  35. 0x2029 // Paragraph Separator <PS>
  36. );
  37. // Prepare a Regenerate set containing all code points that are supposed to be
  38. // matched by `/./u`. https://mths.be/es6#sec-atom
  39. const DOT_SET_UNICODE = UNICODE_SET.clone() // all Unicode code points
  40. .remove(NEWLINE_SET);
  41. const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
  42. if (unicode) {
  43. if (ignoreCase) {
  44. return ESCAPE_SETS.UNICODE_IGNORE_CASE.get(character);
  45. }
  46. return ESCAPE_SETS.UNICODE.get(character);
  47. }
  48. return ESCAPE_SETS.REGULAR.get(character);
  49. };
  50. const getUnicodeDotSet = (dotAll) => {
  51. return dotAll ? UNICODE_SET : DOT_SET_UNICODE;
  52. };
  53. const getUnicodePropertyValueSet = (property, value) => {
  54. const path = value ?
  55. `${ property }/${ value }` :
  56. `Binary_Property/${ property }`;
  57. try {
  58. return require(`regenerate-unicode-properties/${ path }.js`);
  59. } catch (exception) {
  60. throw new Error(
  61. `Failed to recognize value \`${ value }\` for property ` +
  62. `\`${ property }\`.`
  63. );
  64. }
  65. };
  66. const handleLoneUnicodePropertyNameOrValue = (value) => {
  67. // It could be a `General_Category` value or a binary property.
  68. // Note: `unicodeMatchPropertyValue` throws on invalid values.
  69. try {
  70. const property = 'General_Category';
  71. const category = unicodeMatchPropertyValue(property, value);
  72. return getUnicodePropertyValueSet(property, category);
  73. } catch (exception) {}
  74. // It’s not a `General_Category` value, so check if it’s a property
  75. // of strings.
  76. try {
  77. return getUnicodePropertyValueSet('Property_of_Strings', value);
  78. } catch (exception) {}
  79. // Lastly, check if it’s a binary property of single code points.
  80. // Note: `unicodeMatchProperty` throws on invalid properties.
  81. const property = unicodeMatchProperty(value);
  82. return getUnicodePropertyValueSet(property);
  83. };
  84. const getUnicodePropertyEscapeSet = (value, isNegative) => {
  85. const parts = value.split('=');
  86. const firstPart = parts[0];
  87. let set;
  88. if (parts.length == 1) {
  89. set = handleLoneUnicodePropertyNameOrValue(firstPart);
  90. } else {
  91. // The pattern consists of two parts, i.e. `Property=Value`.
  92. const property = unicodeMatchProperty(firstPart);
  93. const value = unicodeMatchPropertyValue(property, parts[1]);
  94. set = getUnicodePropertyValueSet(property, value);
  95. }
  96. if (isNegative) {
  97. if (set.strings) {
  98. throw new Error('Cannot negate Unicode property of strings');
  99. }
  100. return {
  101. characters: UNICODE_SET.clone().remove(set.characters),
  102. strings: new Set()
  103. };
  104. }
  105. return {
  106. characters: set.characters.clone(),
  107. strings: set.strings
  108. // We need to escape strings like *️⃣ to make sure that they can be safely used in unions.
  109. ? new Set(set.strings.map(str => str.replace(SPECIAL_CHARS, '\\$1')))
  110. : new Set()
  111. };
  112. };
  113. const getUnicodePropertyEscapeCharacterClassData = (property, isNegative) => {
  114. const set = getUnicodePropertyEscapeSet(property, isNegative);
  115. const data = getCharacterClassEmptyData();
  116. data.singleChars = set.characters;
  117. if (set.strings.size > 0) {
  118. data.longStrings = set.strings;
  119. data.maybeIncludesStrings = true;
  120. }
  121. return data;
  122. };
  123. function configNeedCaseFoldAscii() {
  124. return !!config.modifiersData.i;
  125. }
  126. function configNeedCaseFoldUnicode() {
  127. // config.modifiersData.i : undefined | false
  128. if (config.modifiersData.i === false) return false;
  129. if (!config.transform.unicodeFlag) return false;
  130. return Boolean(config.modifiersData.i || config.flags.ignoreCase);
  131. }
  132. // Given a range of code points, add any case-folded code points in that range
  133. // to a set.
  134. regenerate.prototype.iuAddRange = function(min, max) {
  135. const $this = this;
  136. do {
  137. const folded = caseFold(min, configNeedCaseFoldAscii(), configNeedCaseFoldUnicode());
  138. if (folded) {
  139. $this.add(folded);
  140. }
  141. } while (++min <= max);
  142. return $this;
  143. };
  144. regenerate.prototype.iuRemoveRange = function(min, max) {
  145. const $this = this;
  146. do {
  147. const folded = caseFold(min, configNeedCaseFoldAscii(), configNeedCaseFoldUnicode());
  148. if (folded) {
  149. $this.remove(folded);
  150. }
  151. } while (++min <= max);
  152. return $this;
  153. };
  154. const update = (item, pattern) => {
  155. let tree = parse(pattern, config.useUnicodeFlag ? 'u' : '', {
  156. lookbehind: true,
  157. namedGroups: true,
  158. unicodePropertyEscape: true,
  159. unicodeSet: true,
  160. modifiers: true,
  161. });
  162. switch (tree.type) {
  163. case 'characterClass':
  164. case 'group':
  165. case 'value':
  166. // No wrapping needed.
  167. break;
  168. default:
  169. // Wrap the pattern in a non-capturing group.
  170. tree = wrap(tree, pattern);
  171. }
  172. Object.assign(item, tree);
  173. };
  174. const wrap = (tree, pattern) => {
  175. // Wrap the pattern in a non-capturing group.
  176. return {
  177. 'type': 'group',
  178. 'behavior': 'ignore',
  179. 'body': [tree],
  180. 'raw': `(?:${ pattern })`
  181. };
  182. };
  183. const caseFold = (codePoint, includeAscii, includeUnicode) => {
  184. let folded = (includeUnicode ? iuMappings.get(codePoint) : undefined) || [];
  185. if (typeof folded === 'number') folded = [folded];
  186. if (includeAscii) {
  187. if (codePoint >= 0x41 && codePoint <= 0x5A) {
  188. folded.push(codePoint + 0x20);
  189. } else if (codePoint >= 0x61 && codePoint <= 0x7A) {
  190. folded.push(codePoint - 0x20);
  191. }
  192. }
  193. return folded.length == 0 ? false : folded;
  194. };
  195. const buildHandler = (action) => {
  196. switch (action) {
  197. case 'union':
  198. return {
  199. single: (data, cp) => {
  200. data.singleChars.add(cp);
  201. },
  202. regSet: (data, set2) => {
  203. data.singleChars.add(set2);
  204. },
  205. range: (data, start, end) => {
  206. data.singleChars.addRange(start, end);
  207. },
  208. iuRange: (data, start, end) => {
  209. data.singleChars.iuAddRange(start, end);
  210. },
  211. nested: (data, nestedData) => {
  212. data.singleChars.add(nestedData.singleChars);
  213. for (const str of nestedData.longStrings) data.longStrings.add(str);
  214. if (nestedData.maybeIncludesStrings) data.maybeIncludesStrings = true;
  215. }
  216. };
  217. case 'union-negative': {
  218. const regSet = (data, set2) => {
  219. data.singleChars = UNICODE_SET.clone().remove(set2).add(data.singleChars);
  220. };
  221. return {
  222. single: (data, cp) => {
  223. const unicode = UNICODE_SET.clone();
  224. data.singleChars = data.singleChars.contains(cp) ? unicode : unicode.remove(cp);
  225. },
  226. regSet: regSet,
  227. range: (data, start, end) => {
  228. data.singleChars = UNICODE_SET.clone().removeRange(start, end).add(data.singleChars);
  229. },
  230. iuRange: (data, start, end) => {
  231. data.singleChars = UNICODE_SET.clone().iuRemoveRange(start, end).add(data.singleChars);
  232. },
  233. nested: (data, nestedData) => {
  234. regSet(data, nestedData.singleChars);
  235. if (nestedData.maybeIncludesStrings) throw new Error('ASSERTION ERROR');
  236. }
  237. };
  238. }
  239. case 'intersection': {
  240. const regSet = (data, set2) => {
  241. if (data.first) data.singleChars = set2;
  242. else data.singleChars.intersection(set2);
  243. };
  244. return {
  245. single: (data, cp) => {
  246. data.singleChars = data.first || data.singleChars.contains(cp) ? regenerate(cp) : regenerate();
  247. data.longStrings.clear();
  248. data.maybeIncludesStrings = false;
  249. },
  250. regSet: (data, set) => {
  251. regSet(data, set);
  252. data.longStrings.clear();
  253. data.maybeIncludesStrings = false;
  254. },
  255. range: (data, start, end) => {
  256. if (data.first) data.singleChars.addRange(start, end);
  257. else data.singleChars.intersection(regenerate().addRange(start, end));
  258. data.longStrings.clear();
  259. data.maybeIncludesStrings = false;
  260. },
  261. iuRange: (data, start, end) => {
  262. if (data.first) data.singleChars.iuAddRange(start, end);
  263. else data.singleChars.intersection(regenerate().iuAddRange(start, end));
  264. data.longStrings.clear();
  265. data.maybeIncludesStrings = false;
  266. },
  267. nested: (data, nestedData) => {
  268. regSet(data, nestedData.singleChars);
  269. if (data.first) {
  270. data.longStrings = nestedData.longStrings;
  271. data.maybeIncludesStrings = nestedData.maybeIncludesStrings;
  272. } else {
  273. for (const str of data.longStrings) {
  274. if (!nestedData.longStrings.has(str)) data.longStrings.delete(str);
  275. }
  276. if (!nestedData.maybeIncludesStrings) data.maybeIncludesStrings = false;
  277. }
  278. }
  279. };
  280. }
  281. case 'subtraction': {
  282. const regSet = (data, set2) => {
  283. if (data.first) data.singleChars.add(set2);
  284. else data.singleChars.remove(set2);
  285. };
  286. return {
  287. single: (data, cp) => {
  288. if (data.first) data.singleChars.add(cp);
  289. else data.singleChars.remove(cp);
  290. },
  291. regSet: regSet,
  292. range: (data, start, end) => {
  293. if (data.first) data.singleChars.addRange(start, end);
  294. else data.singleChars.removeRange(start, end);
  295. },
  296. iuRange: (data, start, end) => {
  297. if (data.first) data.singleChars.iuAddRange(start, end);
  298. else data.singleChars.iuRemoveRange(start, end);
  299. },
  300. nested: (data, nestedData) => {
  301. regSet(data, nestedData.singleChars);
  302. if (data.first) {
  303. data.longStrings = nestedData.longStrings;
  304. data.maybeIncludesStrings = nestedData.maybeIncludesStrings;
  305. } else {
  306. for (const str of data.longStrings) {
  307. if (nestedData.longStrings.has(str)) data.longStrings.delete(str);
  308. }
  309. }
  310. }
  311. };
  312. }
  313. // The `default` clause is only here as a safeguard; it should never be
  314. // reached. Code coverage tools should ignore it.
  315. /* node:coverage ignore next */
  316. default:
  317. throw new Error(`Unknown set action: ${ characterClassItem.kind }`);
  318. }
  319. };
  320. const getCharacterClassEmptyData = () => ({
  321. transformed: config.transform.unicodeFlag,
  322. singleChars: regenerate(),
  323. longStrings: new Set(),
  324. hasEmptyString: false,
  325. first: true,
  326. maybeIncludesStrings: false
  327. });
  328. const maybeFold = (codePoint) => {
  329. const caseFoldAscii = configNeedCaseFoldAscii();
  330. const caseFoldUnicode = configNeedCaseFoldUnicode();
  331. if (caseFoldAscii || caseFoldUnicode) {
  332. const folded = caseFold(codePoint, caseFoldAscii, caseFoldUnicode);
  333. if (folded) {
  334. return [codePoint, folded];
  335. }
  336. }
  337. return [codePoint];
  338. };
  339. const computeClassStrings = (classStrings, regenerateOptions) => {
  340. let data = getCharacterClassEmptyData();
  341. const caseFoldAscii = configNeedCaseFoldAscii();
  342. const caseFoldUnicode = configNeedCaseFoldUnicode();
  343. for (const string of classStrings.strings) {
  344. if (string.characters.length === 1) {
  345. maybeFold(string.characters[0].codePoint).forEach((cp) => {
  346. data.singleChars.add(cp);
  347. });
  348. } else {
  349. let stringifiedString;
  350. if (caseFoldUnicode || caseFoldAscii) {
  351. stringifiedString = '';
  352. for (const ch of string.characters) {
  353. let set = regenerate(ch.codePoint);
  354. const folded = maybeFold(ch.codePoint);
  355. if (folded) set.add(folded);
  356. stringifiedString += set.toString(regenerateOptions);
  357. }
  358. } else {
  359. stringifiedString = string.characters.map(ch => generate(ch)).join('')
  360. }
  361. data.longStrings.add(stringifiedString);
  362. data.maybeIncludesStrings = true;
  363. }
  364. }
  365. return data;
  366. }
  367. const computeCharacterClass = (characterClassItem, regenerateOptions) => {
  368. let data = getCharacterClassEmptyData();
  369. let handlePositive;
  370. let handleNegative;
  371. switch (characterClassItem.kind) {
  372. case 'union':
  373. handlePositive = buildHandler('union');
  374. handleNegative = buildHandler('union-negative');
  375. break;
  376. case 'intersection':
  377. handlePositive = buildHandler('intersection');
  378. handleNegative = buildHandler('subtraction');
  379. if (config.transform.unicodeSetsFlag) data.transformed = true;
  380. break;
  381. case 'subtraction':
  382. handlePositive = buildHandler('subtraction');
  383. handleNegative = buildHandler('intersection');
  384. if (config.transform.unicodeSetsFlag) data.transformed = true;
  385. break;
  386. // The `default` clause is only here as a safeguard; it should never be
  387. // reached. Code coverage tools should ignore it.
  388. /* node:coverage ignore next */
  389. default:
  390. throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
  391. }
  392. const caseFoldAscii = configNeedCaseFoldAscii();
  393. const caseFoldUnicode = configNeedCaseFoldUnicode();
  394. for (const item of characterClassItem.body) {
  395. switch (item.type) {
  396. case 'value':
  397. maybeFold(item.codePoint).forEach((cp) => {
  398. handlePositive.single(data, cp);
  399. });
  400. break;
  401. case 'characterClassRange':
  402. const min = item.min.codePoint;
  403. const max = item.max.codePoint;
  404. handlePositive.range(data, min, max);
  405. if (caseFoldAscii || caseFoldUnicode) {
  406. handlePositive.iuRange(data, min, max);
  407. data.transformed = true;
  408. }
  409. break;
  410. case 'characterClassEscape':
  411. handlePositive.regSet(data, getCharacterClassEscapeSet(
  412. item.value,
  413. config.flags.unicode || config.flags.unicodeSets,
  414. config.flags.ignoreCase
  415. ));
  416. break;
  417. case 'unicodePropertyEscape':
  418. const nestedData = getUnicodePropertyEscapeCharacterClassData(item.value, item.negative);
  419. handlePositive.nested(data, nestedData);
  420. data.transformed =
  421. data.transformed ||
  422. config.transform.unicodePropertyEscapes ||
  423. (config.transform.unicodeSetsFlag && (nestedData.maybeIncludesStrings || characterClassItem.kind !== "union"));
  424. break;
  425. case 'characterClass':
  426. const handler = item.negative ? handleNegative : handlePositive;
  427. const res = computeCharacterClass(item, regenerateOptions);
  428. handler.nested(data, res);
  429. data.transformed = true;
  430. break;
  431. case 'classStrings':
  432. handlePositive.nested(data, computeClassStrings(item, regenerateOptions));
  433. data.transformed = true;
  434. break;
  435. // The `default` clause is only here as a safeguard; it should never be
  436. // reached. Code coverage tools should ignore it.
  437. /* node:coverage ignore next */
  438. default:
  439. throw new Error(`Unknown term type: ${ item.type }`);
  440. }
  441. data.first = false;
  442. }
  443. if (characterClassItem.negative && data.maybeIncludesStrings) {
  444. throw new SyntaxError('Cannot negate set containing strings');
  445. }
  446. return data;
  447. }
  448. const processCharacterClass = (
  449. characterClassItem,
  450. regenerateOptions,
  451. computed = computeCharacterClass(characterClassItem, regenerateOptions)
  452. ) => {
  453. const negative = characterClassItem.negative;
  454. const { singleChars, transformed, longStrings } = computed;
  455. if (transformed) {
  456. // If single chars already contains some astral character, regenerate (bmpOnly: true) will create valid regex strings
  457. const bmpOnly = regenerateContainsAstral(singleChars);
  458. const setStr = singleChars.toString(Object.assign({}, regenerateOptions, { bmpOnly: bmpOnly }));
  459. if (negative) {
  460. if (config.useUnicodeFlag) {
  461. update(characterClassItem, `[^${setStr[0] === '[' ? setStr.slice(1, -1) : setStr}]`)
  462. } else {
  463. if (config.flags.unicode || config.flags.unicodeSets) {
  464. if (config.flags.ignoreCase) {
  465. const astralCharsSet = singleChars.clone().intersection(ASTRAL_SET);
  466. // Assumption: singleChars do not contain lone surrogates.
  467. // Regex like /[^\ud800]/u is not supported
  468. const surrogateOrBMPSetStr = singleChars
  469. .clone()
  470. .remove(astralCharsSet)
  471. .addRange(0xd800, 0xdfff)
  472. .toString({ bmpOnly: true });
  473. // Don't generate negative lookahead for astral characters
  474. // because the case folding is not working anyway as we break
  475. // code points into surrogate pairs.
  476. const astralNegativeSetStr = ASTRAL_SET
  477. .clone()
  478. .remove(astralCharsSet)
  479. .toString(regenerateOptions);
  480. // The transform here does not support lone surrogates.
  481. update(
  482. characterClassItem,
  483. `(?!${surrogateOrBMPSetStr})[^]|${astralNegativeSetStr}`
  484. );
  485. } else {
  486. // Generate negative set directly when case folding is not involved.
  487. const negativeSet = UNICODE_SET.clone().remove(singleChars);
  488. update(characterClassItem, negativeSet.toString(regenerateOptions));
  489. }
  490. } else {
  491. update(characterClassItem, `(?!${setStr})[^]`);
  492. }
  493. }
  494. } else {
  495. const hasEmptyString = longStrings.has('');
  496. const pieces = Array.from(longStrings).sort((a, b) => b.length - a.length);
  497. if (setStr !== '[]' || longStrings.size === 0) {
  498. pieces.splice(pieces.length - (hasEmptyString ? 1 : 0), 0, setStr);
  499. }
  500. update(characterClassItem, pieces.join('|'));
  501. }
  502. }
  503. return characterClassItem;
  504. };
  505. const assertNoUnmatchedReferences = (groups) => {
  506. const unmatchedReferencesNames = Object.keys(groups.unmatchedReferences);
  507. if (unmatchedReferencesNames.length > 0) {
  508. throw new Error(`Unknown group names: ${unmatchedReferencesNames}`);
  509. }
  510. };
  511. const processModifiers = (item, regenerateOptions, groups) => {
  512. const enabling = item.modifierFlags.enabling;
  513. const disabling = item.modifierFlags.disabling;
  514. delete item.modifierFlags;
  515. item.behavior = 'ignore';
  516. const oldData = Object.assign({}, config.modifiersData);
  517. enabling.split('').forEach(flag => {
  518. config.modifiersData[flag] = true;
  519. });
  520. disabling.split('').forEach(flag => {
  521. config.modifiersData[flag] = false;
  522. });
  523. item.body = item.body.map(term => {
  524. return processTerm(term, regenerateOptions, groups);
  525. });
  526. config.modifiersData = oldData;
  527. return item;
  528. }
  529. const processTerm = (item, regenerateOptions, groups) => {
  530. switch (item.type) {
  531. case 'dot':
  532. if (config.transform.unicodeFlag) {
  533. update(
  534. item,
  535. getUnicodeDotSet(config.flags.dotAll || config.modifiersData.s).toString(regenerateOptions)
  536. );
  537. } else if (config.transform.dotAllFlag || config.modifiersData.s) {
  538. // TODO: consider changing this at the regenerate level.
  539. update(item, '[^]');
  540. }
  541. break;
  542. case 'characterClass':
  543. item = processCharacterClass(item, regenerateOptions);
  544. break;
  545. case 'unicodePropertyEscape':
  546. const data = getUnicodePropertyEscapeCharacterClassData(item.value, item.negative);
  547. if (data.maybeIncludesStrings) {
  548. if (!config.flags.unicodeSets) {
  549. throw new Error(
  550. 'Properties of strings are only supported when using the unicodeSets (v) flag.'
  551. );
  552. }
  553. if (config.transform.unicodeSetsFlag) {
  554. data.transformed = true;
  555. item = processCharacterClass(item, regenerateOptions, data);
  556. }
  557. } else if (config.transform.unicodePropertyEscapes) {
  558. update(
  559. item,
  560. data.singleChars.toString(regenerateOptions)
  561. );
  562. }
  563. break;
  564. case 'characterClassEscape':
  565. if (config.transform.unicodeFlag) {
  566. update(
  567. item,
  568. getCharacterClassEscapeSet(
  569. item.value,
  570. /* config.transform.unicodeFlag implies config.flags.unicode */ true,
  571. config.flags.ignoreCase
  572. ).toString(regenerateOptions)
  573. );
  574. }
  575. break;
  576. case 'group':
  577. if (item.behavior == 'normal') {
  578. groups.lastIndex++;
  579. }
  580. if (item.name) {
  581. const name = item.name.value;
  582. if (groups.namesConflicts[name]) {
  583. throw new Error(
  584. `Group '${ name }' has already been defined in this context.`
  585. );
  586. }
  587. groups.namesConflicts[name] = true;
  588. if (config.transform.namedGroups) {
  589. delete item.name;
  590. }
  591. const index = groups.lastIndex;
  592. if (!groups.names[name]) {
  593. groups.names[name] = [];
  594. }
  595. groups.names[name].push(index);
  596. if (groups.onNamedGroup) {
  597. groups.onNamedGroup.call(null, name, index);
  598. }
  599. if (groups.unmatchedReferences[name]) {
  600. delete groups.unmatchedReferences[name];
  601. }
  602. }
  603. if (item.modifierFlags && config.transform.modifiers) {
  604. return processModifiers(item, regenerateOptions, groups);
  605. }
  606. /* falls through */
  607. case 'quantifier':
  608. item.body = item.body.map(term => {
  609. return processTerm(term, regenerateOptions, groups);
  610. });
  611. break;
  612. case 'disjunction':
  613. const outerNamesConflicts = groups.namesConflicts;
  614. item.body = item.body.map(term => {
  615. groups.namesConflicts = Object.create(outerNamesConflicts);
  616. return processTerm(term, regenerateOptions, groups);
  617. });
  618. break;
  619. case 'alternative':
  620. item.body = flatMap(item.body, term => {
  621. const res = processTerm(term, regenerateOptions, groups);
  622. // Alternatives cannot contain alternatives; flatten them.
  623. return res.type === 'alternative' ? res.body : res;
  624. });
  625. break;
  626. case 'value':
  627. const codePoint = item.codePoint;
  628. const set = regenerate(codePoint);
  629. const folded = maybeFold(codePoint);
  630. if (folded.length === 1 && item.kind === "symbol" && folded[0] >= 0x20 && folded[0] <= 0x7E) {
  631. // skip regenerate when it is a printable ASCII symbol
  632. break;
  633. }
  634. set.add(folded);
  635. update(item, set.toString(regenerateOptions));
  636. break;
  637. case 'reference':
  638. if (item.name) {
  639. const name = item.name.value;
  640. const indexes = groups.names[name];
  641. if (!indexes) {
  642. groups.unmatchedReferences[name] = true;
  643. }
  644. if (config.transform.namedGroups) {
  645. if (indexes) {
  646. const body = indexes.map(index => ({
  647. 'type': 'reference',
  648. 'matchIndex': index,
  649. 'raw': '\\' + index,
  650. }));
  651. if (body.length === 1) {
  652. return body[0];
  653. }
  654. return {
  655. 'type': 'alternative',
  656. 'body': body,
  657. 'raw': body.map(term => term.raw).join(''),
  658. };
  659. }
  660. // This named reference comes before the group where it’s defined,
  661. // so it’s always an empty match.
  662. return {
  663. 'type': 'group',
  664. 'behavior': 'ignore',
  665. 'body': [],
  666. 'raw': '(?:)',
  667. };
  668. }
  669. }
  670. break;
  671. case 'anchor':
  672. if (config.modifiersData.m) {
  673. if (item.kind == 'start') {
  674. update(item, `(?:^|(?<=${NEWLINE_SET.toString()}))`);
  675. } else if (item.kind == 'end') {
  676. update(item, `(?:$|(?=${NEWLINE_SET.toString()}))`);
  677. }
  678. }
  679. case 'empty':
  680. // Nothing to do here.
  681. break;
  682. // The `default` clause is only here as a safeguard; it should never be
  683. // reached. Code coverage tools should ignore it.
  684. /* node:coverage ignore next */
  685. default:
  686. throw new Error(`Unknown term type: ${ item.type }`);
  687. }
  688. return item;
  689. };
  690. const config = {
  691. 'flags': {
  692. 'ignoreCase': false,
  693. 'unicode': false,
  694. 'unicodeSets': false,
  695. 'dotAll': false,
  696. 'multiline': false,
  697. },
  698. 'transform': {
  699. 'dotAllFlag': false,
  700. 'unicodeFlag': false,
  701. 'unicodeSetsFlag': false,
  702. 'unicodePropertyEscapes': false,
  703. 'namedGroups': false,
  704. 'modifiers': false,
  705. },
  706. 'modifiersData': {
  707. 'i': undefined,
  708. 's': undefined,
  709. 'm': undefined,
  710. },
  711. get useUnicodeFlag() {
  712. return (this.flags.unicode || this.flags.unicodeSets) && !this.transform.unicodeFlag;
  713. }
  714. };
  715. const validateOptions = (options) => {
  716. if (!options) return;
  717. for (const key of Object.keys(options)) {
  718. const value = options[key];
  719. switch (key) {
  720. case 'dotAllFlag':
  721. case 'unicodeFlag':
  722. case 'unicodePropertyEscapes':
  723. case 'unicodeSetsFlag':
  724. case 'namedGroups':
  725. if (value != null && value !== false && value !== 'transform') {
  726. throw new Error(`.${key} must be false (default) or 'transform'.`);
  727. }
  728. break;
  729. case 'modifiers':
  730. if (value != null && value !== false && value !== 'parse' && value !== 'transform') {
  731. throw new Error(`.${key} must be false (default), 'parse' or 'transform'.`);
  732. }
  733. break;
  734. case 'onNamedGroup':
  735. case 'onNewFlags':
  736. if (value != null && typeof value !== 'function') {
  737. throw new Error(`.${key} must be a function.`);
  738. }
  739. break;
  740. default:
  741. throw new Error(`.${key} is not a valid regexpu-core option.`);
  742. }
  743. }
  744. };
  745. const hasFlag = (flags, flag) => flags ? flags.includes(flag) : false;
  746. const transform = (options, name) => options ? options[name] === 'transform' : false;
  747. const rewritePattern = (pattern, flags, options) => {
  748. validateOptions(options);
  749. config.flags.unicode = hasFlag(flags, 'u');
  750. config.flags.unicodeSets = hasFlag(flags, 'v');
  751. config.flags.ignoreCase = hasFlag(flags, 'i');
  752. config.flags.dotAll = hasFlag(flags, 's');
  753. config.flags.multiline = hasFlag(flags, 'm');
  754. config.transform.dotAllFlag = config.flags.dotAll && transform(options, 'dotAllFlag');
  755. config.transform.unicodeFlag = (config.flags.unicode || config.flags.unicodeSets) && transform(options, 'unicodeFlag');
  756. config.transform.unicodeSetsFlag = config.flags.unicodeSets && transform(options, 'unicodeSetsFlag');
  757. // unicodeFlag: 'transform' implies unicodePropertyEscapes: 'transform'
  758. config.transform.unicodePropertyEscapes = (config.flags.unicode || config.flags.unicodeSets) && (
  759. transform(options, 'unicodeFlag') || transform(options, 'unicodePropertyEscapes')
  760. );
  761. config.transform.namedGroups = transform(options, 'namedGroups');
  762. config.transform.modifiers = transform(options, 'modifiers');
  763. config.modifiersData.i = undefined;
  764. config.modifiersData.s = undefined;
  765. config.modifiersData.m = undefined;
  766. const regjsparserFeatures = {
  767. 'modifiers': Boolean(options && options.modifiers),
  768. // Enable every stable RegExp feature by default
  769. 'unicodePropertyEscape': true,
  770. 'unicodeSet': true,
  771. 'namedGroups': true,
  772. 'lookbehind': true,
  773. };
  774. const regenerateOptions = {
  775. 'hasUnicodeFlag': config.useUnicodeFlag,
  776. 'bmpOnly': !config.flags.unicode && !config.flags.unicodeSets
  777. };
  778. const groups = {
  779. 'onNamedGroup': options && options.onNamedGroup,
  780. 'lastIndex': 0,
  781. 'names': Object.create(null), // { [name]: Array<index> }
  782. 'namesConflicts': Object.create(null), // { [name]: true }
  783. 'unmatchedReferences': Object.create(null) // { [name]: true }
  784. };
  785. const tree = parse(pattern, flags, regjsparserFeatures);
  786. if (config.transform.modifiers) {
  787. if (/\(\?[a-z]*-[a-z]+:/.test(pattern)) {
  788. // the pattern _likely_ contain inline disabled modifiers
  789. // we need to traverse to make sure that they are actually modifiers and to collect them
  790. const allDisabledModifiers = Object.create(null)
  791. const itemStack = [tree];
  792. let node;
  793. while (node = itemStack.pop(), node != undefined) {
  794. if (Array.isArray(node)) {
  795. Array.prototype.push.apply(itemStack, node);
  796. } else if (typeof node == 'object' && node != null) {
  797. for (const key of Object.keys(node)) {
  798. const value = node[key];
  799. if (key == 'modifierFlags') {
  800. if (value.disabling.length > 0){
  801. value.disabling.split('').forEach((flag)=>{
  802. allDisabledModifiers[flag] = true
  803. });
  804. }
  805. } else if (typeof value == 'object' && value != null) {
  806. itemStack.push(value);
  807. }
  808. }
  809. }
  810. }
  811. for (const flag of Object.keys(allDisabledModifiers)) {
  812. config.modifiersData[flag] = true;
  813. }
  814. }
  815. }
  816. // Note: `processTerm` mutates `tree` and `groups`.
  817. processTerm(tree, regenerateOptions, groups);
  818. assertNoUnmatchedReferences(groups);
  819. const onNewFlags = options && options.onNewFlags;
  820. if (onNewFlags) {
  821. let newFlags = flags.split('').filter((flag) => !config.modifiersData[flag]).join('');
  822. if (config.transform.unicodeSetsFlag) {
  823. newFlags = newFlags.replace('v', 'u');
  824. }
  825. if (config.transform.unicodeFlag) {
  826. newFlags = newFlags.replace('u', '');
  827. }
  828. if (config.transform.dotAllFlag === 'transform') {
  829. newFlags = newFlags.replace('s', '');
  830. }
  831. onNewFlags(newFlags);
  832. }
  833. return generate(tree);
  834. };
  835. module.exports = rewritePattern;