session-desktop/ts/util/linkPreviews.ts

/* global URL */

import { compact, isEmpty, isNumber, range } from 'lodash';
import nodeUrl from 'url';
import LinkifyIt from 'linkify-it';

const linkify = LinkifyIt();

function maybeParseHref(href: string) {
  try {
    return new URL(href);
  } catch (err) {
    return null;
  }
}

function isLinkSafeToPreview(href: string) {
  const url = maybeParseHref(href);
  return Boolean(url && url.protocol === 'https:' && !isLinkSneaky(href));
}

function findLinks(text: string, caretLocation?: number) {
  const haveCaretLocation = isNumber(caretLocation);
  const textLength = text ? text.length : 0;

  const matches = linkify.match(text || '') || [];
  return compact(
    matches.map(match => {
      if (!haveCaretLocation) {
        return match.text;
      }

      if (match.lastIndex === textLength && caretLocation === textLength) {
        return match.text;
      }

      if (match.index > caretLocation || match.lastIndex < caretLocation) {
        return match.text;
      }

      return null;
    })
  );
}

function getDomain(href: string) {
  const url = maybeParseHref(href);
  return url ? url.hostname : null;
}

// See <https://tools.ietf.org/html/rfc3986>.
const VALID_URI_CHARACTERS = new Set([
  '%',
  // "gen-delims"
  ':',
  '/',
  '?',
  '#',
  '[',
  ']',
  '@',
  // "sub-delims"
  '!',
  '$',
  '&',
  "'",
  '(',
  ')',
  '*',
  '+',
  ',',
  ';',
  '=',
  // unreserved
  ...String.fromCharCode(...range(65, 91), ...range(97, 123)),
  ...range(10).map(String),
  '-',
  '.',
  '_',
  '~',
]);
const ASCII_PATTERN = new RegExp('[\\u0020-\\u007F]', 'g');
const MAX_HREF_LENGTH = 2 ** 12;

function isLinkSneaky(href: string) {
  // This helps users avoid extremely long links (which could be hiding something
  //   sketchy) and also sidesteps the performance implications of extremely long hrefs.
  if (href.length > MAX_HREF_LENGTH) {
    return true;
  }

  const url = maybeParseHref(href);

  // If we can't parse it, it's sneaky.
  if (!url) {
    return true;
  }

  // Any links which contain auth are considered sneaky
  if (url.username) {
    return true;
  }

  // If the domain is falsy, something fishy is going on
  if (!url.hostname) {
    return true;
  }

  // To quote [RFC 1034][0]: "the total number of octets that represent a
  //   domain name [...] is limited to 255." To be extra careful, we set a
  //   maximum of 2048. (This also uses the string's `.length` property,
  //   which isn't exactly the same thing as the number of octets.)
  // [0]: https://tools.ietf.org/html/rfc1034
  if (url.hostname.length > 2048) {
    return true;
  }

  // Domains cannot contain encoded characters
  if (url.hostname.includes('%')) {
    return true;
  }

  // There must be at least 2 domain labels, and none of them can be empty.
  const labels = url.hostname.split('.');
  if (labels.length < 2 || labels.some(isEmpty)) {
    return true;
  }

  // This is necessary because getDomain returns domains in punycode form.
  const unicodeDomain = nodeUrl.domainToUnicode
    ? nodeUrl.domainToUnicode(url.hostname)
    : url.hostname;

  const withoutPeriods = unicodeDomain.replace(/\./g, '');

  const hasASCII = ASCII_PATTERN.test(withoutPeriods);
  const withoutASCII = withoutPeriods.replace(ASCII_PATTERN, '');

  const isMixed = hasASCII && withoutASCII.length > 0;
  if (isMixed) {
    return true;
  }

  // We can't use `url.pathname` (and so on) because it automatically encodes strings.
  //   For example, it turns `/aquí` into `/aqu%C3%AD`.
  const startOfPathAndHash = href.indexOf('/', url.protocol.length + 4);
  const pathAndHash = startOfPathAndHash === -1 ? '' : href.substr(startOfPathAndHash);
  return [...pathAndHash].some(character => !VALID_URI_CHARACTERS.has(character));
}

export const LinkPreviews = { isLinkSneaky, getDomain, findLinks, isLinkSafeToPreview };
Link Previews 7 years ago			`/* global URL */`

move linkPreviews.js to ts 4 years ago			`import { compact, isEmpty, isNumber, range } from 'lodash';`
			`import nodeUrl from 'url';`
			`import LinkifyIt from 'linkify-it';`
Link Previews 7 years ago
			`const linkify = LinkifyIt();`

move linkPreviews.js to ts 4 years ago			`function maybeParseHref(href: string) {`
Link Previews 7 years ago			`try {`
add staged link previews for everything from signal 5 years ago			`return new URL(href);`
			`} catch (err) {`
			`return null;`
Link Previews 7 years ago			`}`
			`}`

move linkPreviews.js to ts 4 years ago			`function isLinkSafeToPreview(href: string) {`
add staged link previews for everything from signal 5 years ago			`const url = maybeParseHref(href);`
			`return Boolean(url && url.protocol === 'https:' && !isLinkSneaky(href));`
Link Previews 7 years ago			`}`

move linkPreviews.js to ts 4 years ago			`function findLinks(text: string, caretLocation?: number) {`
Fine-tune linkification technique for link previews 7 years ago			`const haveCaretLocation = isNumber(caretLocation);`
			`const textLength = text ? text.length : 0;`

Link Previews 7 years ago			`const matches = linkify.match(text \|\| '') \|\| [];`
Fine-tune linkification technique for link previews 7 years ago			`return compact(`
			`matches.map(match => {`
			`if (!haveCaretLocation) {`
			`return match.text;`
			`}`

			`if (match.lastIndex === textLength && caretLocation === textLength) {`
			`return match.text;`
			`}`

			`if (match.index > caretLocation \|\| match.lastIndex < caretLocation) {`
			`return match.text;`
			`}`

			`return null;`
			`})`
			`);`
Link Previews 7 years ago			`}`

move linkPreviews.js to ts 4 years ago			`function getDomain(href: string) {`
add staged link previews for everything from signal 5 years ago			`const url = maybeParseHref(href);`
			`return url ? url.hostname : null;`
Link Previews 7 years ago			`}`

add staged link previews for everything from signal 5 years ago			`// See <https://tools.ietf.org/html/rfc3986>.`
			`const VALID_URI_CHARACTERS = new Set([`
			`'%',`
			`// "gen-delims"`
			`':',`
			`'/',`
			`'?',`
			`'#',`
			`'[',`
			`']',`
			`'@',`
			`// "sub-delims"`
			`'!',`
			`'$',`
			`'&',`
			`"'",`
			`'(',`
			`')',`
			`'*',`
			`'+',`
			`',',`
			`';',`
			`'=',`
			`// unreserved`
			`...String.fromCharCode(...range(65, 91), ...range(97, 123)),`
			`...range(10).map(String),`
			`'-',`
			`'.',`
			`'_',`
			`'~',`
			`]);`
			`const ASCII_PATTERN = new RegExp('[\\u0020-\\u007F]', 'g');`
			`const MAX_HREF_LENGTH = 2 ** 12;`

move linkPreviews.js to ts 4 years ago			`function isLinkSneaky(href: string) {`
add staged link previews for everything from signal 5 years ago			`// This helps users avoid extremely long links (which could be hiding something`
			`// sketchy) and also sidesteps the performance implications of extremely long hrefs.`
			`if (href.length > MAX_HREF_LENGTH) {`
			`return true;`
Link Previews 7 years ago			`}`

add staged link previews for everything from signal 5 years ago			`const url = maybeParseHref(href);`
Link Previews 7 years ago
add staged link previews for everything from signal 5 years ago			`// If we can't parse it, it's sneaky.`
			`if (!url) {`
			`return true;`
Link Previews 7 years ago			`}`

add staged link previews for everything from signal 5 years ago			`// Any links which contain auth are considered sneaky`
			`if (url.username) {`
			`return true;`
Link Previews 7 years ago			`}`

add staged link previews for everything from signal 5 years ago			`// If the domain is falsy, something fishy is going on`
			`if (!url.hostname) {`
			`return true;`
Improve handling for URLs composed of mixed character sets 7 years ago			`}`

add staged link previews for everything from signal 5 years ago			`// To quote [RFC 1034][0]: "the total number of octets that represent a`
			`// domain name [...] is limited to 255." To be extra careful, we set a`
			// maximum of 2048. (This also uses the string's `.length` property,
			`// which isn't exactly the same thing as the number of octets.)`
			`// [0]: https://tools.ietf.org/html/rfc1034`
			`if (url.hostname.length > 2048) {`
Improve handling for URLs composed of mixed character sets 7 years ago			`return true;`
			`}`

add staged link previews for everything from signal 5 years ago			`// Domains cannot contain encoded characters`
			`if (url.hostname.includes('%')) {`
Improve handling for URLs composed of mixed character sets 7 years ago			`return true;`
			`}`

add staged link previews for everything from signal 5 years ago			`// There must be at least 2 domain labels, and none of them can be empty.`
			`const labels = url.hostname.split('.');`
			`if (labels.length < 2 \|\| labels.some(isEmpty)) {`
Improve handling for URLs composed of mixed character sets 7 years ago			`return true;`
			`}`

Fix typos via codespell 3 years ago			`// This is necessary because getDomain returns domains in punycode form.`
add staged link previews for everything from signal 5 years ago			`const unicodeDomain = nodeUrl.domainToUnicode`
			`? nodeUrl.domainToUnicode(url.hostname)`
			`: url.hostname;`
Improve handling for URLs composed of mixed character sets 7 years ago
add staged link previews for everything from signal 5 years ago			`const withoutPeriods = unicodeDomain.replace(/\./g, '');`
Improve handling for URLs composed of mixed character sets 7 years ago
add staged link previews for everything from signal 5 years ago			`const hasASCII = ASCII_PATTERN.test(withoutPeriods);`
			`const withoutASCII = withoutPeriods.replace(ASCII_PATTERN, '');`
Improve handling for URLs composed of mixed character sets 7 years ago
add staged link previews for everything from signal 5 years ago			`const isMixed = hasASCII && withoutASCII.length > 0;`
			`if (isMixed) {`
			`return true;`
Improve handling for URLs composed of mixed character sets 7 years ago			`}`

add staged link previews for everything from signal 5 years ago			// We can't use `url.pathname` (and so on) because it automatically encodes strings.
			// For example, it turns `/aquí` into `/aqu%C3%AD`.
			`const startOfPathAndHash = href.indexOf('/', url.protocol.length + 4);`
increase prettier maxWidth to 100 5 years ago			`const pathAndHash = startOfPathAndHash === -1 ? '' : href.substr(startOfPathAndHash);`
			`return [...pathAndHash].some(character => !VALID_URI_CHARACTERS.has(character));`
Improve handling for URLs composed of mixed character sets 7 years ago			`}`
move linkPreviews.js to ts 4 years ago
			`export const LinkPreviews = { isLinkSneaky, getDomain, findLinks, isLinkSafeToPreview };`