// src/Parser.js - Documentation

// Here, we use vanilla JavaScript for recurring operations that together take up a lot of time.

import CommentSkeleton from './CommentSkeleton';
import ElementsAndTextTreeWalker from './ElementsAndTextTreeWalker';
import ElementsTreeWalker from './ElementsTreeWalker';
import cd from './cd';
import { getHeadingLevel, parseWikiUrl, isHeadingNode, isInline, isMetadataNode, ucFirst, underlinesToSpaces, definedAndNotNull, unique } from './utils-general';
import { parseTimestamp } from './utils-timestamp';

/**
 * @typedef {object} Context
 * @property {Function} CommentClass
 * @property {Function} SectionClass
 * @property {string} childElementsProp
 * @property {Function} follows
 * @property {Function} getAllTextNodes
 * @property {Function} getElementByClassName
 * @property {Element|external:Element} rootElement
 * @property {boolean} areThereOutdents
 * @property {Function} processAndRemoveDtElements
 * @property {Function} removeDtButtonHtmlComments
 */

/**
 * Generalization of a web page (not wikitext) parser for the window and worker contexts. Parsing
 * here means "extracting meaningful parts from the page" such as comments, sections, etc. Functions
 * related to wikitext parsing go in {@link module:wikitext}.
 */
class Parser {
  /**
   * Create a page parser in the provided context.
   *
   * @param {Context} context Collection of classes, functions, and other properties that perform
   *   the tasks we need in the current context (window or worker).
   */
  constructor(context) {
    this.context = context;
    this.existingCommentIds = [];
  }

  /**
   * Set some properties and find some elements required for parsing.
   *
   * @private
   */
  init() {
    this.rejectClasses = [
      'cd-comment-part',

      // Extension:Translate
      'mw-pt-languages',

      // Likely won't have much effect, but won't hurt
      'mw-archivedtalk',

      // For templates like https://ru.wikipedia.org/wiki/Template:Сложное_обсуждение (perhaps they
      // need to be `tmbox` too?).
      'ombox',

      ...cd.config.closedDiscussionClasses,
      cd.config.outdentClass,
    ];

    // Example of a comment in a figure element:
    // https://ru.wikipedia.org/w/index.php?title=Википедия%3AФорум%2FНовости&diff=prev&oldid=131939933
    const tagSelector = ['blockquote', 'q', 'cite', 'figure', 'th'].join(', ');

    const classSelector = cd.g.noSignatureClasses.map((name) => `.${name}`).join(', ');

    this.noSignatureElements = [
      ...this.context.rootElement.querySelectorAll(`${tagSelector}, ${classSelector}`),
    ];
  }

  /**
   * Create a comment instance.
   *
   * @param {Element|external:Element} signature
   * @param {object[]} targets
   * @returns {*}
   */
  createComment(signature, targets) {
    return new this.context.CommentClass(this, signature, targets);
  }

  /**
   * Create a section instance.
   *
   * @param {object} heading
   * @param {object[]} targets
   * @param {import('./Subscriptions').default} subscriptions
   * @returns {*}
   */
  createSection(heading, targets, subscriptions) {
    return new this.context.SectionClass(this, heading, targets, subscriptions);
  }

  /**
   * _For internal use._ Remove some of the elements added by the DiscussionTools extension (even if
   * it is disabled in user preferences) or move them away if the topic subscriptions feature of DT
   * is enabled (to avoid errors being thrown in DT). Prior to that, extract data from them.
   *
   * CD already parses comment links from notifications (which seems to be this markup's purpose for
   * disabled DT) in `BootProcess#processTargets()`. Unless the elements prove useful to CD or other
   * scripts, it's better to get rid of them rather than deal with them one by one while parsing.
   *
   * @param {import('./BootProcess').default} [bootProcess]
   */
  processAndRemoveDtMarkup(bootProcess) {
    const elements = [...this.context.rootElement.getElementsByTagName('span')]
      .filter((el) => (
        el.hasAttribute('data-mw-comment-start') ||
        el.hasAttribute('data-mw-comment-end') ||

        // This, in fact, targets the one span at the top of the page, out of sections which makes
        // comments taller (example:
        // https://commons.wikimedia.org/w/index.php?title=User_talk:Jack_who_built_the_house/CD_test_page&oldid=876639400).
        // Check for classes and content because in older DT versions, `data-mw-thread-id` was on
        // the .mw-headline element.
        (
          el.tagName === 'SPAN' &&
          el.hasAttribute('data-mw-thread-id') &&
          !el.classList.length &&
          !el.textContent
        )
      ))
      .concat(
        [...this.context.rootElement.getElementsByClassName('ext-discussiontools-init-replylink-buttons')]
      )
      .filter(unique);
    this.context.processAndRemoveDtElements(elements, bootProcess);
    this.context.removeDtButtonHtmlComments();
  }

  /**
   * Handle outdent character sequences added by
   * {@link https://en.wikipedia.org/wiki/User:Alexis_Jazz/Factotum Factotum}.
   *
   * @param {string} text
   * @param {Node|external:Node} node
   * @private
   */
  handleFactotumOutdents(text, node) {
    if (
      !/^┌─*┘$/.test(text) ||
      node.parentNode.classList.contains(cd.config.outdentClass) ||
      node.parentNode.parentNode.classList.contains(cd.config.outdentClass)
    ) {
      return;
    }

    const span = document.createElement('span');
    span.className = cd.config.outdentClass;
    span.textContent = text;
    if (node.nextSibling?.tagName === 'BR') {
      node.nextSibling.remove();
    }

    // Don't have Node#replaceChild() in the worker.
    node.parentNode.insertBefore(span, node);
    node.remove();
  }

  /**
   * @typedef {object} Timestamp
   * @property {Element|external:Element} element
   * @property {Date} date
   * @property {object} [match]
   * @memberof Parser
   * @inner
   */

  /**
   * Find a timestamp in a text node.
   *
   * @param {Node} node
   * @returns {?Timestamp}
   * @private
   */
  findTimestamp(node) {
    const text = node.textContent;

    // While we're here, wrap outdents inserted by Factotum into a span.
    this.handleFactotumOutdents(text, node);

    const { date, match } = parseTimestamp(text) || {};
    if (!date || this.noSignatureElements.some((el) => el.contains(node))) {
      return null;
    }

    const element = document.createElement('span');
    element.classList.add('cd-timestamp');
    element.appendChild(document.createTextNode(match[2]));
    const remainedText = node.textContent.slice(match.index + match[0].length);
    const afterNode = remainedText ? document.createTextNode(remainedText) : undefined;
    node.textContent = match[1];
    node.parentNode.insertBefore(element, node.nextSibling);
    if (afterNode) {
      node.parentNode.insertBefore(afterNode, element.nextSibling);
    }
    return { element, date };
  }

  /**
   * Collect nodes related to a signature starting from a timestamp node.
   *
   * The method walks backwards from the timestamp element (or from the enclosing "unsigned"
   * template element, if there is one) over previous siblings, climbing to inline parents when
   * siblings run out, and inspects links on the way to figure out the author. The nodes recognized
   * as belonging to the signature are then moved into a newly created `span.cd-signature` element
   * that is inserted in their place.
   *
   * @param {object} timestamp Timestamp data as returned by `findTimestamp()`.
   * @param {Element|external:Element} timestamp.element Timestamp element.
   * @param {Date} timestamp.date Parsed date.
   * @returns {?object} `null` if no author name could be extracted; otherwise an object with the
   *   properties `element` (the signature wrapper), `timestampElement`, `timestampText`, `date`,
   *   `authorLink`, `authorTalkLink`, `authorName`, `isUnsigned`, and `isExtraSignature`.
   * @private
   */
  getSignatureFromTimestamp(timestamp) {
    // Look for an element with the "unsigned" class among the inline ancestors of the timestamp.
    let unsignedElement;
    {
      let el = timestamp.element;
      while (!unsignedElement && (el = el.parentNode) && isInline(el) !== false) {
        if (el.classList.contains(cd.config.unsignedClass)) {
          unsignedElement = el;
        }
      }
    }

    // If the closest block-level timestamp element ancestor has more than one signature, we choose
    // the first signature to consider it the signature of the comment author while keeping the
    // last. There is no point for us to parse them as distinct comments as a reply posted using our
    // script will go below all of them anyway.
    let isExtraSignature = false;
    const elementsTreeWalker = new ElementsTreeWalker(timestamp.element, this.context.rootElement);
    while (
      elementsTreeWalker.previousNode() &&
      (
        isInline(elementsTreeWalker.currentNode) !== false ||
        isMetadataNode(elementsTreeWalker.currentNode)
      )
    ) {
      if (elementsTreeWalker.currentNode.classList.contains('cd-signature')) {
        isExtraSignature = true;
        break;
      }
    }

    const startElement = unsignedElement || timestamp.element;
    const treeWalker = new ElementsAndTextTreeWalker(startElement, this.context.rootElement);

    // Filled in by `processLinkData()`: `name`, `link`, `talkLink`, and some internal flags.
    const authorData = {};

    // Length of the text scanned so far; limits how far back we go.
    let length = 0;

    // The earliest (in the document order) node known to belong to the signature.
    let firstSignatureElement;

    // Nodes visited so far, in the order of visiting (i.e. reverse to the document order).
    let signatureNodes = [];
    if (unsignedElement) {
      firstSignatureElement = startElement;
    } else {
      signatureNodes.push(startElement);
      treeWalker.previousSibling();
    }

    // Unsigned template may be of the "undated" kind - containing a timestamp but no author name,
    // so we need to walk the tree anyway.
    let node = treeWalker.currentNode;
    do {
      length += node.textContent.length;
      if (node.tagName) {
        authorData.isLastLinkAuthorLink = false;

        if (node.tagName === 'A') {
          // A `false` result means the link is not a part of the signature, so we stop here.
          if (!this.constructor.processLinkData(node, authorData)) break;
        } else {
          // Links inside the element are processed from the last to the first, matching the
          // direction of the walk.
          const links = [...node.getElementsByTagName('a')].reverse();
          for (const link of links) {
            // https://en.wikipedia.org/wiki/Template:Talkback and similar cases
            if (link.classList.contains('external')) continue;

            this.constructor.processLinkData(link, authorData);
          }
        }

        if (authorData.isLastLinkAuthorLink) {
          firstSignatureElement = node;
        }
      }
      signatureNodes.push(node);

      node = treeWalker.previousSibling();
      if (!node && !firstSignatureElement) {
        // No more siblings and nothing recognized as a signature part yet: climb to the parent (if
        // it is not a block element) and start collecting anew from there.
        node = treeWalker.parentNode();
        if (!node || isInline(node) === false) break;
        length = 0;
        signatureNodes = [];
      }
    } while (
      node &&
      isInline(node, true) !== false &&
      length < cd.config.signatureScanLimit &&
      !(
        (
          // Conditions that stop the walk only when the author is already known
          authorData.name &&
          (
            // Users may cross out the text ended with their signature and sign again
            // (https://ru.wikipedia.org/?diff=114726134). The strike element shouldn't be
            // considered a part of the signature then.
            (node.tagName && ['S', 'STRIKE', 'DEL'].includes(node.tagName)) ||

            // Cases with a talk page link at the end of comment's text like
            // https://ru.wikipedia.org/wiki/Википедия:Заявки_на_статус_администратора/Obersachse_3#c-Obersachse-2012-03-11T08:03:00.000Z-Итог
            // Note that this is currently unsupported by our wikitext parser. When edited, such a
            // comment will be cut at the first user link. You would need to discern ". " inside and
            // outside of links or even tags, and this is much work for little gain. This is the
            // cost of us not relying on a DOM -> wikitext correspondence and processing these parts
            // separately.
            (!node.tagName && this.constructor.punctuationRegexp.test(node.textContent)) ||

            (
              node.tagName &&

              (
                // Invisible pings, like
                // https://he.wikipedia.org/w/index.php?title=שיחה:שפת_אמת&oldid=38365117#c-אייל-20240205174400-אייל-20240205172600
                /display: *none/.test(node.getAttribute('style')) ||

                this.noSignatureElements.some((noSigEl) => noSigEl === node)
              )
            )
          )
        ) ||
        (
          // Conditions that stop the walk regardless of whether the author is known
          node.tagName &&
          (
            node.classList.contains('cd-timestamp') ||
            node.classList.contains('cd-signature') ||

            // Workaround for cases like https://en.wikipedia.org/?diff=1042059387 (those should be
            // extremely rare).
            (['S', 'STRIKE', 'DEL'].includes(node.tagName) && length >= 30)
          )
        )
      )
    );

    if (!authorData.name) {
      return null;
    }

    if (!signatureNodes.length) {
      signatureNodes = [startElement];
    }

    // Keep the nodes from the start element up to (and including) the first signature element and
    // drop everything visited after it. If that element is not in the list, keep only the first
    // node.
    const fseIndex = signatureNodes.indexOf(firstSignatureElement);
    signatureNodes.splice(fseIndex === -1 ? 1 : fseIndex + 1);

    // Wrap the nodes into a signature element. The nodes were collected backwards, so they are
    // reversed to restore the document order. The insertion point is remembered before the nodes
    // are moved.
    const signatureContainer = signatureNodes[0].parentNode;
    const startElementNextSibling = signatureNodes[0].nextSibling;
    const element = document.createElement('span');
    element.classList.add('cd-signature');
    signatureNodes.reverse().forEach(element.appendChild.bind(element));
    signatureContainer.insertBefore(element, startElementNextSibling);

    return {
      element,
      timestampElement: timestamp.element,
      timestampText: timestamp.element.textContent,
      date: timestamp.date,
      authorLink: authorData.link,
      authorTalkLink: authorData.talkLink,
      authorName: authorData.name,
      isUnsigned: Boolean(unsignedElement),
      isExtraSignature,
    };
  }

  /**
   * Find outputs of unsigned templates.
   *
   * @returns {object[]}
   */
  findUnsigneds() {
    if (!cd.config.unsignedClass) {
      return [];
    }

    const unsigneds = [];
    [...this.context.rootElement.getElementsByClassName(cd.config.unsignedClass)]
      .filter((element) => {
        // Only templates with no timestamp interest us.
        if (this.context.getElementByClassName(element, 'cd-timestamp')) {
          return false;
        }

        // Cases like https://ru.wikipedia.org/?diff=84883816
        for (let el = element; el && el !== this.context.rootElement; el = el.parentNode) {
          if (el.classList.contains('cd-signature')) {
            return false;
          }
        }

        return true;
      })
      .forEach((element) => {
        [...element.getElementsByTagName('a')].some((link) => {
          const { userName: authorName, linkType } = this.constructor.processLink(link) || {};
          if (authorName) {
            let authorLink;
            let authorTalkLink;
            if (linkType === 'user') {
              authorLink = link;
            } else if (linkType === 'userTalk') {
              authorTalkLink = link;
            }
            element.classList.add('cd-signature');
            const isUnsigned = true;
            unsigneds.push({
              element,
              authorName,
              isUnsigned,
              authorLink,
              authorTalkLink,
            });
            return true;
          }
        });
      });

    return unsigneds;
  }

  /**
   * _For internal use._ Find signatures under the root element.
   *
   * Characters before the author link, like "—", aren't considered a part of the signature.
   *
   * @returns {object[]}
   */
  findSignatures() {
    const signatures = this.context.getAllTextNodes()
      .map(this.findTimestamp.bind(this))
      .filter(definedAndNotNull)
      .map(this.getSignatureFromTimestamp.bind(this))
      .filter(definedAndNotNull)
      .concat(this.findUnsigneds());

    // Move extra signatures (additional signatures for a comment, if there is more than one) to an
    // array which then assign to a relevant signature (the one which goes first).
    let extraSignatures = [];
    return signatures
      .slice()
      .reverse()
      .map((sig) => {
        if (sig.isExtraSignature) {
          extraSignatures.push(sig);
        } else {
          sig.extraSignatures = extraSignatures;
          extraSignatures = [];
        }
        return Object.assign({ type: 'signature' }, sig);
      })
      .filter((sig) => !sig.isExtraSignature);
  }

  /**
   * With code like this:
   *
   * ```html
   * * Smth. [signature]
   * :: Smth. [signature]
   * ```
   *
   * one comment (preceded by :: in this case) creates its own list tree, not a subtree, even though
   * it's a reply to a reply. So we dive as deep to the bottom of the hierarchy of nested lists as
   * we can to get the top nodes with comment content (and therefore draw comment layers more
   * accurately). One of the most complex tree structures is this:
   *
   * ```html
   *  * Smth. [signature]
   *  :* Smth.
   *  :: Smth. [signature]
   * ```
   *
   * (seen here:
   * {@link https://ru.wikipedia.org/w/index.php?title=Википедия:Форум/Общий&oldid=103760740#201912010211_Mikhail_Ryazanov})
   * It has a branchy structure that requires a tricky algorithm to be parsed correctly.
   *
   * @param {Element|external:Element} element
   * @param {boolean} [onlyChildrenWithoutCommentLevel=false]
   * @returns {object}
   */
  getTopElementsWithText(element, onlyChildrenWithoutCommentLevel = false) {
    // We ignore all spaces as an easy way to ignore only whitespace text nodes between element
    // nodes (this is a bad idea if we deal with inline nodes, but here we deal with lists).
    const partTextNoSpaces = element.textContent.replace(/\s+/g, '');

    let nodes;
    let children = [element];
    let levelsPassed = 0;
    do {
      nodes = children;
      children = nodes.reduce(
        (arr, element) => arr.concat([...element[this.context.childElementsProp]]),
        []
      );
      if (['DL', 'UL', 'OL'].includes(nodes[0].tagName)) {
        levelsPassed++;
      }
    } while (
      children.length &&
      children.every((child) => (
        (
          ['DL', 'UL', 'OL', 'DD', 'LI'].includes(child.tagName) &&
          (
            !onlyChildrenWithoutCommentLevel ||
            ['DD', 'LI'].includes(child.tagName) ||
            child.classList.contains('cd-commentLevel')
          )
        ) ||

        // An inline (e.g., <small>) tag wrapped around block tags can give that (due to some errors
        // in the markup).
        (!child.textContent.trim() && isInline(child))
      )) &&
      children.map((child) => child.textContent).join('').replace(/\s+/g, '') === partTextNoSpaces
    );

    return { nodes, levelsPassed };
  }

  /**
   * _For internal use._ Get all headings on the page.
   *
   * @returns {object[]}
   */
  findHeadings() {
    return [...this.context.rootElement.querySelectorAll('h1, h2, h3, h4, h5, h6')]
      .map((element) => {
        for (let el = element; el && el !== this.context.rootElement; el = el.parentNode) {
          if (el.classList.contains('mw-heading')) {
            return el;
          }
        }
        return element;
      })
      .filter((el) => (
        el.getAttribute('id') !== 'mw-toc-heading' &&
        !this.noSignatureElements.some((noSigEl) => noSigEl.contains(el))
      ))
      .map((element) => ({
        type: 'heading',
        isWrapper: !isHeadingNode(element, true),
        level: getHeadingLevel(element),
        element,
      }));
  }

  /**
   * Turn a structure like this
   *
   * ```html
   * <dd>
   *   <div>Comment. [signature]</div>
   *   <ul>...</ul>
   * </dd>
   * ```
   *
   * into a structure like this
   *
   * ```html
   * <dd>
   *   <div>Comment. [signature]</div>
   * </dd>
   * <dd>
   *   <ul>...</ul>
   * </dd>
   * ```
   *
   * by splitting the parent node of the given node, moving all the following nodes into the second
   * node resulting from the split. If there is no following nodes, don't perform the split.
   *
   * @param {Element|external:Element} node Reference node.
   * @returns {Array.<Element|external:Element>} The parent nodes resultant from the split (at least
   *   one).
   */
  splitParentAfterNode(node) {
    const parent = node.parentNode;
    const clone = parent.cloneNode();
    let lastChild;
    while ((lastChild = parent.lastChild) && lastChild !== node) {
      clone.insertBefore(lastChild, clone.firstChild);
    }
    if (clone[this.context.childElementsProp].length > 0) {
      parent.parentNode.insertBefore(clone, parent.nextSibling);
    }
    return [parent, clone];
  }

  /**
   * @typedef {object} ProcessLinkReturn
   * @property {string} userName User name.
   * @property {?string} linkType Link type (`user`, `userTalk`, `contribs`, `userSubpage`,
   *   `userTalkSubpage`, or any of these with `Foreign` at the end).
   * @memberof Parser
   * @inner
   */

  /**
   * _For internal use._ Get a user name from a link, along with some other data about a page name.
   *
   * @param {Element|external:Element} element
   * @returns {?ProcessLinkReturn}
   */
  static processLink(element) {
    const href = element.getAttribute('href');
    let userName;
    let linkType = null;
    if (href) {
      const { pageName, hostname, fragment } = parseWikiUrl(href) || {};
      if (!pageName || CommentSkeleton.isAnyId(fragment)) {
        return null;
      }

      const match = pageName.match(cd.g.userNamespacesRegexp);
      if (match) {
        userName = match[1];
        if (cd.g.userLinkRegexp.test(pageName)) {
          linkType = 'user';
        } else if (cd.g.userTalkLinkRegexp.test(pageName)) {
          linkType = 'userTalk';
        } else if (cd.g.userSubpageLinkRegexp.test(pageName)) {
          linkType = 'userSubpage';
        } else if (cd.g.userTalkSubpageLinkRegexp.test(pageName)) {
          linkType = 'userTalkSubpage';
        }

        // Another alternative is a user link to another site where the prefix is specified before
        // the namespace. Enough to capture the user name from, not enough to make any inferences.
      } else if (cd.g.contribsPageLinkRegexp.test(pageName)) {
        userName = pageName.replace(cd.g.contribsPageLinkRegexp, '');
        if (cd.g.isIPv6Address?.(userName)) {
          userName = userName.toUpperCase();
        }
        linkType = 'contribs';
      }
      if (hostname !== cd.g.serverName) {
        linkType += 'Foreign';
      }
      userName &&= ucFirst(underlinesToSpaces(userName.replace(/\/.*/, ''))).trim();
    } else {
      if (
        element.classList.contains('mw-selflink') &&
        cd.g.namespaceNumber === 3 &&
        !cd.g.pageName.includes('/')
      ) {
        // Comments of users that have only the user talk page link in their signature on their talk
        // page.
        userName = cd.g.pageTitle;
      } else {
        return null;
      }
    }

    return { userName, linkType };
  }

  /**
   * Given a link node, enrich the author data and return a boolean denoting whether the node is a
   * part of the signature.
   *
   * @param {Element|external:Element} link
   * @param {object} authorData
   * @returns {boolean}
   * @private
   */
  static processLinkData(link, authorData) {
    const { userName, linkType } = this.processLink(link) || {};
    if (userName) {
      authorData.name ||= userName;
      if (authorData.name === userName) {
        if (['user', 'userForeign'].includes(linkType)) {
          // Break only when the second user link is a link to another wiki (but not the other way
          // around, see an example: https://en.wikipedia.org/?diff=1012665097).
          if (authorData.notForeignLink && linkType === 'userForeign') {
            return false;
          }
          if (linkType !== 'userForeign') {
            authorData.notForeignLink = link;
          }
          authorData.link = link;
        } else if (['userTalk', 'userTalkForeign'].includes(linkType)) {
          if (authorData.talkNotForeignLink) {
            return false;
          }
          if (linkType !== 'userTalkForeign') {
            authorData.talkNotForeignLink = link;
          }
          authorData.talkLink = link;
        } else if (['contribs', 'contribsForeign'].includes(linkType)) {
          // authorData.contribsNotForeignLink is used only to make sure there are no two contribs
          // links to the current hostname in a signature.
          if (authorData.contribsNotForeignLink && (authorData.link || authorData.talkLink)) {
            return false;
          }
          if (linkType !== 'contribsForeign') {
            authorData.contribsNotForeignLink = link;
          }
        } else if (['userSubpage', 'userSubpageForeign'].includes(linkType)) {
          // A user subpage link after a user link is OK. A user subpage link before a user link is
          // not OK (example: https://ru.wikipedia.org/?diff=112885854). Perhaps part of the
          // comment.
          if (authorData.link || authorData.talkLink) {
            return false;
          }
        } else if (['userTalkSubpage', 'userTalkSubpageForeign'].includes(linkType)) {
          // Same as with a user page above.
          if (authorData.link || authorData.talkLink) {
            return false;
          }
        } else {
          // Cases like https://ru.wikipedia.org/?diff=115909247
          if (authorData.link || authorData.talkLink) {
            return false;
          }
        }
        authorData.isLastLinkAuthorLink = true;
      } else {
        // Don't return false here in case the user mentioned a redirect to their user page here.
      }
    }
    return true;
  }

  /**
   * Get a nesting level of an element relative to the root element.
   *
   * @param {Element} element
   * @returns {number}
   */
  getNestingLevel(element) {
    const treeWalker = new ElementsTreeWalker(element, this.context.rootElement);
    let nestingLevel = 0;
    while (treeWalker.parentNode()) {
      nestingLevel++;
    }

    return nestingLevel;
  }

  /**
   * Initialize the class.
   */
  static init() {
    // Parenthesis for the case `smth). ~~~~`
    // https://ru.wikipedia.org/w/index.php?title=Википедия:Форум/Новости&oldid=138050961#c-Lesless-20240526055500-Deinocheirus-20240525165500
    // Non-Latin punctuation is collected manually from https://en.wikipedia.org/wiki/Full_stop and
    // other sources.
    this.punctuationRegexp = new RegExp(`(?:^|[${cd.g.letterPattern}])[)\\]]*(?:[.!?…।։။۔]+ |[。！？]+)`);
  }
}

export default Parser;