src/utils-wikitext.js

  1. /**
  2. * Wikitext parsing and processing utilities.
  3. *
  4. * @module utilsWikitext
  5. */
  6. import TextMasker from './TextMasker';
  7. import cd from './cd';
  8. import { decodeHtmlEntities, generatePageNamePattern, removeDirMarks } from './utils-general';
  9. import { parseTimestamp } from './utils-timestamp';
  /**
   * Build a regular expression that finds the given tags in the text together with what they wrap.
   *
   * Capture groups of the resulting expression:
   * 1. the opening tag (with attributes, if any);
   * 2. the tag name;
   * 3. the content between the opening and the closing tag;
   * 4. the closing tag.
   *
   * The expression is global and case-insensitive.
   *
   * @param {string[]} tags Tag names to look for.
   * @returns {RegExp}
   */
  export function generateTagsRegexp(tags) {
    const alternatives = tags.join('|');
    const openingTagPattern = `(<(${alternatives})(?: [\\w ]+(?:=[^<>]+?)?| *)>)`;

    // `[^]` matches any character including newlines, so the content may span several lines.
    const contentPattern = '([^]*?)';

    // `\2` refers back to the tag name captured in the opening tag.
    const closingTagPattern = '(</\\2>)';

    return new RegExp(openingTagPattern + contentPattern + closingTagPattern, 'ig');
  }
  /**
   * Blank out parts of the code that could produce false matches, keeping every remaining character
   * at its original position (each replacement has the same length as the text it replaces).
   *
   * What gets blanked:
   * - the content of `<nowiki>`, `<syntaxhighlight>`, `<source>`, and `<pre>` tags (the tags
   *   themselves stay);
   * - HTML comments (`<!-- -->`), which become a run of spaces enclosed in `\x01` and `\x02`;
   * - left-to-right and right-to-left marks;
   * - newlines inside some tags (`<br\n>`).
   *
   * This is used to ignore comment contents (there could be section code examples for novices there
   * that could confuse search results) but still get right positions and code in the result.
   *
   * @param {string} code
   * @returns {string}
   */
  export function maskDistractingCode(code) {
    const spaces = (length) => ' '.repeat(length);
    const verbatimTagsRegexp = generateTagsRegexp(['nowiki', 'syntaxhighlight', 'source', 'pre']);

    // Keep the tags, blank the content between them.
    let masked = code.replace(
      verbatimTagsRegexp,
      (match, openingTag, tagName, inner, closingTag) => openingTag + spaces(inner.length) + closingTag
    );

    // `<!--` and `-->` take 7 characters together; two of them are taken by the \x01 and \x02
    // markers, the remaining 5 are added to the run of spaces.
    masked = masked.replace(
      /<!--([^]*?)-->/g,
      (match, inner) => '\x01' + spaces(inner.length + 5) + '\x02'
    );

    masked = masked.replace(/[\u200e\u200f]/g, ' ');

    masked = masked.replace(
      /(<\/?(?:br|p)\b.*)(\n+)(>)/g,
      (match, head, newlines, tail) => head + spaces(newlines.length) + tail
    );

    return masked;
  }
  /**
   * Get the timestamp of the first signature found in the code.
   *
   * @param {string} code
   * @returns {?string} The timestamp, or `null` if there are no signatures or the first one has no
   *   timestamp.
   */
  export function findFirstTimestamp(code) {
    const [firstSignature] = extractSignatures(code);
    return firstSignature?.timestamp || null;
  }
  /**
   * Remove certain kinds of wiki markup from code, such as formatting, links, tags, and comments.
   * Also replace multiple spaces with one and trim the input. The product of this function is usually
   * not for display (for example, it just removes template names making the resulting code look
   * silly), but for comparing purposes.
   *
   * @param {string} code
   * @returns {string}
   */
  export function removeWikiMarkup(code) {
    // Ideally, only text from images in the "thumb" format should be captured, because in the
    // standard format the text is not displayed. See img_thumbnail in
    // https://ru.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=magicwords&formatversion=2.
    // Unfortunately, that would add like 100ms to the server's response time. So, we use it if it is
    // present in the config file.
    const fileEmbedRegexp = new RegExp(
      `\\[\\[${cd.g.filePrefixPattern}[^\\]]+?(?:\\|[^\\]]+?\\| *((?:\\[\\[[^\\]]+?\\]\\]|[^|\\]])+))? *\\]\\]`,
      'ig'
    );

    return code
      // Remove comments
      .replace(/<!--[^]*?-->/g, '')

      // Remove text hidden by the script (for example, in wikitext.maskDistractingCode)
      .replace(/\x01 *\x02/g, '')

      // Pipe trick
      .replace(cd.g.pipeTrickRegexp, '$1$2$3')

      // Extract displayed text from file embeddings. The caption group is optional, so for an
      // embedding without a caption (e.g. `[[File:X.jpg|thumb]]`) it is `undefined`. A replacer
      // function returning `undefined` would make `replace()` insert the literal string
      // "undefined", so fall back to an empty string in that case.
      .replace(fileEmbedRegexp, (s, caption) => (cd.g.isThumbRegexp.test(s) && caption) || '')

      // Extract displayed text from [[wikilinks]]
      .replace(/\[\[:?(?:[^|[\]<>\n]+\|)?(.+?)\]\]/g, '$1')

      // For optimization purposes, remove template names
      .replace(/\{\{:?(?:[^|{}<>\n]+)(?:\|(.+?))?\}\}/g, '$1')

      // Extract displayed text from [links]
      .replace(/\[https?:\/\/[^[\]<>"\n ]+ *([^\]]*)\]/g, '$1')

      // Remove bold
      .replace(/'''(.+?)'''/g, '$1')

      // Remove italics
      .replace(/''(.+?)''/g, '$1')

      // Replace <br> with a space
      .replace(/<br ?\/?>/g, ' ')

      // Remove opening and self-closing tags (won't work with <smth param=">">, but the native parser
      // fails too).
      .replace(/<\w+(?: [\w ]+(?:=[^<>]+?)?| *\/?)>/g, '')

      // Remove closing tags
      .replace(/<\/\w+(?: [\w ]+)? *>/g, '')

      // Replace multiple spaces with one space
      .replace(/ {2,}/g, ' ')

      .trim();
  }
  /**
   * Decode HTML entities in the text, then collapse every run of whitespace characters (of any kind)
   * into one normal space and trim the result.
   *
   * @param {string} text
   * @returns {string}
   */
  export function normalizeCode(text) {
    const decoded = decodeHtmlEntities(text);
    const singleSpaced = decoded.replace(/\s+/g, ' ');
    return singleSpaced.trim();
  }
  /**
   * Percent-encode the characters that would break a `[[wikilink]]`: the angle brackets of HTML-like
   * tags, square brackets, curly braces, and pipes. Runs of whitespace are collapsed into one space.
   *
   * This is meant for section links as the characters that this function encodes are forbidden in
   * page titles anyway, so page titles containing them are not valid titles.
   *
   * @param {string} link
   * @returns {string}
   */
  export function encodeWikilink(link) {
    const bracketCodes = {
      '[': '%5B',
      ']': '%5D',
      '{': '%7B',
      '|': '%7C',
      '}': '%7D',
    };

    // Only `<` and `>` that form a tag are encoded, standalone ones are left alone.
    const withEncodedTags = link.replace(
      /<(\w+(?: [\w ]+(?:=[^<>]+?)?| *\/?)|\/\w+(?: [\w ]+)? *)>/g,
      '%3C$1%3E'
    );

    // None of the codes contains a character from the set, so one pass is enough.
    const withEncodedBrackets = withEncodedTags.replace(/[[\]{|}]/g, (char) => bracketCodes[char]);

    return withEncodedBrackets.replace(/\s+/g, ' ');
  }
  /**
   * Extract signatures from wikitext.
   *
   * Only basic signature parsing is performed here; more precise signature text identification is
   * performed in `CommentSource#adjustSignature`. See also `CommentSource#adjust`.
   *
   * Regular signatures, signatures coming from the unsigned templates, and the first occurrence of
   * the signature code (`cd.g.signCode`) are collected, ordered by their position, and entries
   * without an author are dropped. Each returned object gets `commentStartIndex` (0 for the first
   * signature, otherwise the position where the previous signature's line ends), `index` (position
   * in the returned array), and `date` (from `parseTimestamp()`, `undefined` if there is no
   * timestamp or it can't be parsed).
   *
   * @param {string} code Code to extract signatures from.
   * @returns {object[]}
   */
  export function extractSignatures(code) {
    // TODO: Instead of removing only lines containing antipatterns from wikitext, hide entire
    // templates and tags?
    // But keep in mind that this code may still be part of comments.
    const noSignatureClassesPattern = cd.g.noSignatureClasses.join('\\b|\\b');
    const commentAntipatternsPatternParts = [
      `class=(['"])[^'"\\n]*(?:\\b${noSignatureClassesPattern}\\b)[^'"\\n]*\\1`
    ];
    if (cd.config.noSignatureTemplates.length) {
      const pattern = cd.config.noSignatureTemplates.map(generatePageNamePattern).join('|');
      commentAntipatternsPatternParts.push(`\\{\\{ *(?:${pattern}) *(?:\\||\\}\\})`);
    }
    commentAntipatternsPatternParts.push(
      ...cd.config.commentAntipatterns.map((regexp) => regexp.source)
    );
    const commentAntipatternsPattern = commentAntipatternsPatternParts.join('|');
    const commentAntipatternsRegexp = new RegExp(`^.*(?:${commentAntipatternsPattern}).*$`, 'mg');

    // Hide HTML comments, quotes and lines containing antipatterns. Everything is replaced with
    // spaces of the same length so that indexes in `adjustedCode` stay valid for `code`.
    const adjustedCode = maskDistractingCode(code)
      .replace(
        cd.g.quoteRegexp,
        (s, beginning, content, ending) => beginning + ' '.repeat(content.length) + ending
      )
      .replace(commentAntipatternsRegexp, (s) => ' '.repeat(s.length));

    const allSignatures = extractRegularSignatures(adjustedCode, code);
    const unsigneds = extractUnsigneds(adjustedCode, code, allSignatures);
    allSignatures.push(...unsigneds);

    // This is for the procedure adding anchors to comments linked from the comment, see
    // CommentForm#prepareNewPageCode.
    const signatureIndex = adjustedCode.indexOf(cd.g.signCode);
    if (signatureIndex !== -1) {
      // The next comment starts right after the line with the signature code. If that line is the
      // last one and has no newline at the end, it is the end of the text (not the start of the
      // signature, which is what `-1 + 1` from a failed `indexOf()` would give).
      const newlineIndex = adjustedCode.indexOf('\n', signatureIndex);
      allSignatures.push({
        author: cd.user.getName(),
        startIndex: signatureIndex,
        nextCommentStartIndex: newlineIndex === -1 ? adjustedCode.length : newlineIndex + 1,
      });
    }

    if (unsigneds.length || signatureIndex !== -1) {
      // A numeric difference returns 0 for equal positions, keeping the comparator consistent.
      allSignatures.sort((sig1, sig2) => sig1.startIndex - sig2.startIndex);
    }

    const signatures = allSignatures.filter((sig) => sig.author);

    let commentStartIndex = 0;
    signatures.forEach((sig, i) => {
      const { date } = sig.timestamp && parseTimestamp(sig.timestamp) || {};

      sig.commentStartIndex = commentStartIndex;
      sig.index = i;
      sig.date = date;

      // `nextCommentStartIndex` is only needed to compute `commentStartIndex` of the next
      // signature and is not a part of the result.
      commentStartIndex = sig.nextCommentStartIndex;
      delete sig.nextCommentStartIndex;
    });

    return signatures;
  }
  /**
   * Extract signatures that don't come from the unsigned templates from wikitext.
   *
   * Lines of `adjustedCode` containing a timestamp are examined one by one. If a user link precedes
   * the timestamp on the line, the entry gets an author and spans from the user link to the end of
   * the timestamp part; otherwise the entry has no author and spans only the timestamp part. The
   * `timestamp` and `dirtyCode` values are cut from `code` using positions found in `adjustedCode`.
   *
   * @param {string} adjustedCode Adjusted page code.
   * @param {string} code Page code.
   * @returns {object[]}
   * @private
   */
  function extractRegularSignatures(adjustedCode, code) {
    const timestampPattern = cd.g.contentTimestampRegexp.source;
    const userNamePattern = cd.g.captureUserNamePattern;
    const lineEndPattern = `(?:\\n*|$)`;
    const afterTimestampPattern = `(?!["»])(?:\\}\\}|</small>)?`;

    // (?:^|[^=]) is used to filter out timestamps in a parameter (in quote templates).
    const timestampLineRegexp = new RegExp(
      `^((.*?(?:^|[^=]))(${timestampPattern})${afterTimestampPattern}).*${lineEndPattern}`,
      'igm'
    );

    // After capturing the first signature with `.*?` another capture is made (with
    // `authorLinkRegexp`) to make sure the first link to the same author as the author in the last
    // link is taken. 251 is not arbitrary: it's 255 (maximum allowed signature length) minus
    // `'[[u:a'.length` plus `' '.length` (the space before the timestamp).
    const signatureScanLimit = 251;

    /*
      Captures:
      1 - the whole line with the signature
      2 - text before the timestamp
      3 - text before the first user link
      4 - author name (inside `cd.g.captureUserNamePattern`)
      5 - sometimes, a slash appears here (inside `cd.g.captureUserNamePattern`)
      6 - timestamp
    */
    const signatureLineRegexp = new RegExp(
      `^(((.*?)${userNamePattern}.{1,${signatureScanLimit - 1}}?[^=])` +
      `(${timestampPattern})${afterTimestampPattern}.*)${lineEndPattern}`,
      'im'
    );

    const lastAuthorLinkRegexp = new RegExp(`^.*${userNamePattern}`, 'i');
    const authorLinkRegexp = new RegExp(userNamePattern, 'ig');

    const found = [];
    for (const lineMatch of adjustedCode.matchAll(timestampLineRegexp)) {
      const [lineText, beforeTimestampEnd, beforeTimestamp, timestampText] = lineMatch;
      const lineOffset = lineMatch.index;
      const signatureMatch = lineText.match(signatureLineRegexp);

      if (!signatureMatch) {
        // There is a timestamp but no user link before it: record the timestamp alone. The entry
        // has no author.
        const startIndex = lineOffset + beforeTimestamp.length;
        const endIndex = lineOffset + beforeTimestampEnd.length;
        const dirtyCode = code.slice(startIndex, endIndex);
        const timestamp = removeDirMarks(
          code.slice(startIndex, startIndex + timestampText.length)
        );
        found.push({
          author: undefined,
          timestamp,
          startIndex,
          endIndex,
          dirtyCode,
          nextCommentStartIndex: lineOffset + lineText.length,
        });
        continue;
      }

      // Timestamp data
      const timestampOffset = lineOffset + signatureMatch[2].length;
      const timestamp = removeDirMarks(
        code.slice(timestampOffset, timestampOffset + signatureMatch[6].length)
      );

      // Signature data. The start is provisional: it may be moved below.
      let startIndex = lineOffset + signatureMatch[3].length;
      const endIndex = lineOffset + signatureMatch[1].length;
      const nextCommentStartIndex = lineOffset + signatureMatch[0].length;

      // Only the last 255 characters before the timestamp are searched for user links.
      const tailOffset = Math.max(0, timestampOffset - lineOffset - 255);
      const tail = signatureMatch[0].slice(tailOffset);
      const lastAuthorLink = tail.match(lastAuthorLinkRegexp)?.[1];

      // Logically it should always be non-empty. There is an unclear problem with
      // https://az.wikipedia.org/w/index.php?title=Vikipediya:Kənd_meydanı&diff=prev&oldid=7223881,
      // probably having something to do with difference between regular length and byte length.
      if (!lastAuthorLink) continue;

      // require() to avoid circular dependency
      const userRegistry = require('./userRegistry').default;
      const author = userRegistry.get(decodeHtmlEntities(lastAuthorLink));

      // Move the start of the signature to the first link to the same author in the tail.
      authorLinkRegexp.lastIndex = 0;
      let linkMatch;
      while ((linkMatch = authorLinkRegexp.exec(tail))) {
        // Slash can be present in linkMatch[2]. It often indicates a link to a page in the author's
        // userspace that is not part of the signature (while some such links are, and we don't want
        // to eliminate those cases).
        if (linkMatch[2]) continue;

        if (userRegistry.get(decodeHtmlEntities(linkMatch[1])) === author) {
          startIndex = lineOffset + tailOffset + linkMatch.index;
          break;
        }
      }

      found.push({
        author,
        timestamp,
        startIndex,
        endIndex,
        dirtyCode: code.slice(startIndex, endIndex),
        nextCommentStartIndex,
      });
    }

    return found;
  }
  /**
   * Extract signatures that come from the unsigned templates from wikitext.
   *
   * Note that this function modifies `signatures`: an existing signature that ends on the same
   * position as an unsigned template is removed from it.
   *
   * @param {string} adjustedCode Adjusted page code.
   * @param {string} code Page code.
   * @param {object[]} signatures Existing signatures.
   * @returns {object[]}
   * @private
   */
  function extractUnsigneds(adjustedCode, code, signatures) {
    if (!cd.config.unsignedTemplates.length) {
      return [];
    }

    // require() to avoid circular dependency
    const userRegistry = require('./userRegistry').default;

    const templateLineRegexp = new RegExp(cd.g.unsignedTemplatesPattern + '.*\\n', 'g');
    const found = [];
    for (const templateMatch of adjustedCode.matchAll(templateLineRegexp)) {
      const [wholeMatch, templateCode, firstParam, secondParam] = templateMatch;

      // The timestamp and the author can come in any order, and the timestamp can be absent.
      let author = firstParam;
      let timestamp;
      if (cd.g.contentTimestampNoTzRegexp.test(firstParam)) {
        timestamp = firstParam;
        author = secondParam;
      } else if (cd.g.contentTimestampNoTzRegexp.test(secondParam)) {
        timestamp = secondParam;
      }
      author &&= userRegistry.get(decodeHtmlEntities(author));

      // Append "(UTC)" to the `timestamp` of templates that allow to omit the timezone. The timezone
      // could be not UTC, but currently the timezone offset is taken from the wiki configuration, so
      // doesn't have effect.
      if (timestamp && !cd.g.contentTimestampRegexp.test(timestamp)) {
        timestamp += ' (UTC)';

        // Workaround for "undated" templates
        author ||= '<undated>';
      }

      // Double spaces
      timestamp = timestamp?.replace(/ +/g, ' ');

      const startIndex = templateMatch.index;
      const endIndex = startIndex + templateCode.length;
      const nextCommentStartIndex = startIndex + wholeMatch.length;

      // `[5 tildes] {{unsigned|...}}` cases. In these cases, both the signature and
      // {{unsigned|...}} are considered signatures and added to the array. We could combine them
      // but that would need corresponding code in Parser.js which could be tricky, so for now we just
      // remove the duplicate. That still allows to reply to the comment.
      const duplicateIndex = signatures.findIndex(
        (sig) => sig.nextCommentStartIndex === nextCommentStartIndex
      );
      if (duplicateIndex !== -1) {
        signatures.splice(duplicateIndex, 1);
      }

      found.push({
        author,
        timestamp,
        startIndex,
        endIndex,
        dirtyCode: code.slice(startIndex, endIndex),
        nextCommentStartIndex,
      });
    }

    return found;
  }
  /**
   * Make a string that ends with no newline or with one newline end with two newlines. (Meant for
   * section wikitext.)
   *
   * A string that already ends with two or more newlines, an empty string, and a string consisting
   * of a single newline are returned unchanged.
   *
   * @param {string} code
   * @returns {string}
   */
  export function endWithTwoNewlines(code) {
    // Nothing to append to: there is no non-newline character at the end or right before the final
    // newline.
    if (code === '' || code === '\n' || code.endsWith('\n\n')) {
      return code;
    }

    return code.endsWith('\n') ? code + '\n' : code + '\n\n';
  }
  /**
   * Replace `<br>`s with `\n` (or with another string), except in list elements and `<pre>`'s
   * created by a space starting a line.
   *
   * @param {string} code
   * @param {string} [replacement='\n'] String to put instead of each `<br>`. It is inserted as is
   *   (sequences like `$1` in it have no special meaning).
   * @returns {string}
   */
  export function brsToNewlines(code, replacement = '\n') {
    // A line that has a `<br>` and doesn't start with a list marker or a space
    const lineWithBrRegexp = /^(?![:*# ]).*<br[ \n]*\/?>.*$/gmi;

    // A `<br>` together with the newline and the spaces that may follow it
    const brRegexp = /<br[ \n]*\/?>(?![:*#;])\n? */gi;

    const getReplacement = () => replacement;

    return code.replace(lineWithBrRegexp, (line) => line.replace(brRegexp, getReplacement));
  }
  /**
   * Mask links that have `|`, replace `|` with `{{!}}`, unmask links. If `maskedTexts` is not
   * provided, sensitive code will be masked as well.
   *
   * Bare `{` and `}` that weren't identified as part of other markup are also replaced, with
   * `&#123;` and `&#125;` (e.g. try quoting the sentence "Или держал в голове то, что практика
   * использования {{doc/begin}} ... делали </div> вместо шаблона." in
   * https://ru.wikipedia.org/wiki/User_talk:Jack_who_built_the_house#c-Jack_who_built_the_house-2020-03-22T12:18:00.000Z-DonRumata-2020-03-22T11:05:00.000Z
   * - "</div>" screws everything up.)
   *
   * @param {string} code
   * @param {string[]} [maskedTexts]
   * @returns {string}
   */
  export function escapePipesOutsideLinks(code, maskedTexts) {
    const textMasker = new TextMasker(code, maskedTexts);
    if (!maskedTexts) {
      textMasker.maskSensitiveCode();
    }

    // Braces go first: `{{!}}` produced for pipes must not be escaped itself.
    const escapeBracesAndPipes = (text) => {
      const withoutOpeningBraces = text.replace(/\{/g, '&#123;');
      const withoutBraces = withoutOpeningBraces.replace(/\}/g, '&#125;');
      return withoutBraces.replace(/\|/g, '{{!}}');
    };

    // If the masked texts were supplied from outside, only the links masked here are unmasked.
    const typeToUnmask = maskedTexts ? 'link' : undefined;

    const withMaskedLinks = textMasker.mask(/\[\[[^\]|]+\|/g, 'link');
    const withEscapedText = withMaskedLinks.withText(escapeBracesAndPipes);
    return withEscapedText.unmask(typeToUnmask).getText();
  }
  /**
   * Extract a number from a string using a set of digits.
   *
   * Characters of `s` that are not in `digits` are skipped; each remaining character is converted to
   * its position in `digits` (so the first character of `digits` stands for 0, the second for 1,
   * etc.), and the resulting sequence is read as a decimal number.
   *
   * Both strings are processed by code points and `digits` is never turned into a regular
   * expression, so digits outside the Basic Multilingual Plane (stored as surrogate pairs) and
   * characters that are special in regular expressions (`-`, `]`, `^`, `\`) work as expected.
   *
   * @param {string} s
   * @param {string} [digits='0123456789']
   * @returns {number} The extracted number; 0 if `s` contains no digits.
   */
  export function extractArabicNumeral(s, digits = '0123456789') {
    const digitValues = new Map();
    [...digits].forEach((digit, value) => {
      // If a character is repeated in `digits`, its first position counts.
      if (!digitValues.has(digit)) {
        digitValues.set(digit, value);
      }
    });

    let decimalString = '';
    for (const char of s) {
      const value = digitValues.get(char);
      if (value !== undefined) {
        decimalString += value;
      }
    }

    return Number(decimalString);
  }