Enable link previews for any site

pull/309/head
Ryan ZHAO 5 years ago
parent 93c7c94204
commit 8bfd4edb53

@ -0,0 +1,119 @@
import Foundation
/// A bag of optional metadata strings extracted from a raw HTML document.
///
/// Every field is nil when the corresponding tag was not found by the parser.
public struct HTMLMetadata: Equatable {
    /// Contents of the `<title>` element.
    var titleTag: String?
    /// `href` of a `<link rel="icon"...>` tag.
    var faviconUrlString: String?
    /// `content` of a `<meta name="description"...>` tag.
    var description: String?
    /// Value of the `og:title` meta property.
    var ogTitle: String?
    /// Value of the `og:description` meta property.
    var ogDescription: String?
    /// Value of the `og:image` meta property, falling back to `og:image:url`.
    var ogImageUrlString: String?
    /// Value of the `og:published_time` meta property.
    var ogPublishDateString: String?
    /// Value of the `article:published_time` meta property.
    var articlePublishDateString: String?
    /// Value of the `og:modified_time` meta property.
    var ogModifiedDateString: String?
    /// Value of the `article:modified_time` meta property.
    var articleModifiedDateString: String?

    /// Builds an `HTMLMetadata` by running every parser over `rawHTML`.
    ///
    /// - Parameter rawHTML: The HTML document text to scan.
    /// - Returns: A value whose fields are populated for each tag that was found.
    static func construct(parsing rawHTML: String) -> HTMLMetadata {
        // All `<meta property=...>` tags are collected once and then looked up by key.
        let properties = parseMetaProperties(in: rawHTML)
        let title = parseTitleTag(in: rawHTML)
        let favicon = parseFaviconUrlString(in: rawHTML)
        let summary = parseDescriptionTag(in: rawHTML)
        let image = properties["og:image"] ?? properties["og:image:url"]

        return HTMLMetadata(
            titleTag: title,
            faviconUrlString: favicon,
            description: summary,
            ogTitle: properties["og:title"],
            ogDescription: properties["og:description"],
            ogImageUrlString: image,
            ogPublishDateString: properties["og:published_time"],
            articlePublishDateString: properties["article:published_time"],
            ogModifiedDateString: properties["og:modified_time"],
            articleModifiedDateString: properties["article:modified_time"]
        )
    }
}
// MARK: - Parsing
extension HTMLMetadata {
    /// Returns the entity-decoded contents of the first `<title>` element, or nil if none is found.
    private static func parseTitleTag(in rawHTML: String) -> String? {
        titleRegex
            .firstMatchSet(in: rawHTML)?
            .group(idx: 0)
            .flatMap { decodeHTMLEntities(in: String($0)) }
    }

    /// Returns the entity-decoded `href` of the first icon `<link>` tag, or nil if none is found.
    private static func parseFaviconUrlString(in rawHTML: String) -> String? {
        guard let matchedTag = faviconRegex
            .firstMatchSet(in: rawHTML)
            .map({ String($0.fullString) }) else { return nil }
        // The attribute is extracted from the matched tag only, not from the whole document.
        return faviconUrlRegex
            .parseFirstMatch(inText: matchedTag)
            .flatMap { decodeHTMLEntities(in: $0) }
    }

    /// Returns the entity-decoded `content` of the first description `<meta>` tag, or nil if none is found.
    private static func parseDescriptionTag(in rawHTML: String) -> String? {
        guard let matchedTag = metaDescriptionRegex
            .firstMatchSet(in: rawHTML)
            .map({ String($0.fullString) }) else { return nil }
        return metaContentRegex
            .parseFirstMatch(inText: matchedTag)
            .flatMap { decodeHTMLEntities(in: $0) }
    }

    /// Collects every `<meta property="..." content="...">` tag into a dictionary.
    ///
    /// Keys are trimmed and lower-cased: the tag regex is case-insensitive and its capture can
    /// include trailing whitespace, while lookups use exact lower-case names such as `og:title`.
    /// The first tag with non-empty decoded content wins; tags with missing or empty content are
    /// skipped so they cannot shadow a later tag or a caller's `??` fallback.
    ///
    /// - Parameter rawHTML: The HTML document text to scan.
    /// - Returns: A map from normalized property name to entity-decoded content.
    private static func parseMetaProperties(in rawHTML: String) -> [String: String] {
        metaPropertyRegex
            .allMatchSets(in: rawHTML)
            .reduce(into: [:]) { (builder, matchSet) in
                guard let rawKey = matchSet.group(idx: 0) else { return }
                let key = rawKey.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()

                // Exit early if the key is unusable or we've already found a tag of this type
                guard !key.isEmpty, builder[key] == nil else { return }

                let fullTag = String(matchSet.fullString)
                guard let content = metaContentRegex.parseFirstMatch(inText: fullTag),
                      let decoded = decodeHTMLEntities(in: content),
                      !decoded.isEmpty else { return }
                builder[key] = decoded
            }
    }

    /// Decodes HTML entities by importing `string` as an HTML document and reading back its plain text.
    ///
    /// - Returns: The decoded text, or nil if the string cannot be UTF-8 encoded or the import fails.
    private static func decodeHTMLEntities(in string: String) -> String? {
        guard let data = string.data(using: .utf8) else {
            return nil
        }
        // NOTE(review): Apple's documentation says the HTML importer should not be called from a
        // background thread. Confirm which thread the link-preview pipeline runs this on.
        let options: [NSAttributedString.DocumentReadingOptionKey: Any] = [
            .documentType: NSAttributedString.DocumentType.html,
            .characterEncoding: String.Encoding.utf8.rawValue
        ]
        guard let attributedString = try? NSAttributedString(data: data, options: options, documentAttributes: nil) else {
            return nil
        }
        return attributedString.string
    }
}
// MARK: - Regular Expressions
extension HTMLMetadata {
    /// Matches a `<title>` element; capture group 1 is the text between the opening and closing tags.
    static let titleRegex = regex(pattern: "<\\s*title[^>]*>(.*?)<\\s*/title[^>]*>")

    /// Matches a `<link>` tag whose double-quoted `rel` value is `icon` or `shortcut icon`.
    static let faviconRegex = regex(pattern: "<\\s*link[^>]*rel\\s*=\\s*\"\\s*(shortcut\\s+)?icon\\s*\"[^>]*>")

    /// Captures the double-quoted value of an `href` attribute.
    static let faviconUrlRegex = regex(pattern: "href\\s*=\\s*\"([^\"]*)\"")

    /// Matches a `<meta>` tag whose double-quoted `name` value begins with `description`.
    static let metaDescriptionRegex = regex(pattern: "<\\s*meta[^>]*name\\s*=\\s*\"\\s*description[^\"]*\"[^>]*>")

    /// Matches a `<meta>` tag with a double-quoted `property` attribute; capture group 1 is its value.
    static let metaPropertyRegex = regex(pattern: "<\\s*meta[^>]*property\\s*=\\s*\"\\s*([^\"]+?)\"[^>]*>")

    /// Captures the double-quoted value of a `content` attribute.
    static let metaContentRegex = regex(pattern: "content\\s*=\\s*\"([^\"]*?)\"")

    /// Compiles `pattern` as a case-insensitive regex in which `.` also matches line separators.
    ///
    /// Uses `try!`, so an invalid pattern traps at first use.
    private static func regex(pattern: String) -> NSRegularExpression {
        let sharedOptions: NSRegularExpression.Options = [.dotMatchesLineSeparators, .caseInsensitive]
        return try! NSRegularExpression(pattern: pattern, options: sharedOptions)
    }
}

@ -291,83 +291,6 @@ public class OWSLinkPreview: MTLModel {
return result.filterStringForDisplay()
}
// MARK: - Whitelists
// For link domains, we require an exact match - no subdomains allowed.
//
// Note that order matters in this whitelist since the logic for determining
// how to render link preview domains in displayDomain(...) uses the first match.
// We should list TLDs first and subdomains later.
private static let linkDomainWhitelist = [
// YouTube
"youtube.com",
"www.youtube.com",
"m.youtube.com",
"youtu.be",
// Reddit
"reddit.com",
"www.reddit.com",
"m.reddit.com",
// NOTE: We don't use redd.it.
// Imgur
//
// NOTE: Subdomains are also used for content.
//
// For example, you can access "user/member" pages: https://sillygoose2.imgur.com/
// A different member page can be accessed without a subdomain: https://imgur.com/user/SillyGoose2
//
// I'm not sure we need to support these subdomains; they don't appear to be core functionality.
"imgur.com",
"www.imgur.com",
"m.imgur.com",
// Instagram
"instagram.com",
"www.instagram.com",
"m.instagram.com",
// Pinterest
"pinterest.com",
"www.pinterest.com",
"pin.it",
// Giphy
"giphy.com",
"media.giphy.com",
"media1.giphy.com",
"media2.giphy.com",
"media3.giphy.com",
"gph.is"
]
// For media domains, we DO NOT require an exact match - subdomains are allowed.
private static let mediaDomainWhitelist = [
// YouTube
"ytimg.com",
// Reddit
"redd.it",
// Imgur
"imgur.com",
// Instagram
"cdninstagram.com",
"fbcdn.net",
// Pinterest
"pinimg.com",
// Giphy
"giphy.com"
]
private static let protocolWhitelist = [
"https"
]
@objc
public func displayDomain() -> String? {
return OWSLinkPreview.displayDomain(forUrl: urlString)
@ -383,13 +306,7 @@ public class OWSLinkPreview: MTLModel {
owsFailDebug("Invalid url.")
return nil
}
guard let result = whitelistedDomain(forUrl: url,
domainWhitelist: OWSLinkPreview.linkDomainWhitelist,
allowSubdomains: false) else {
Logger.error("Missing domain.")
return nil
}
return result
return url.host
}
@objc
@ -397,9 +314,7 @@ public class OWSLinkPreview: MTLModel {
guard let url = URL(string: urlString) else {
return false
}
return whitelistedDomain(forUrl: url,
domainWhitelist: OWSLinkPreview.linkDomainWhitelist,
allowSubdomains: false) != nil
return true
}
@objc
@ -407,36 +322,7 @@ public class OWSLinkPreview: MTLModel {
guard let url = URL(string: urlString) else {
return false
}
return whitelistedDomain(forUrl: url,
domainWhitelist: OWSLinkPreview.mediaDomainWhitelist,
allowSubdomains: true) != nil
}
private class func whitelistedDomain(forUrl url: URL, domainWhitelist: [String], allowSubdomains: Bool) -> String? {
guard let urlProtocol = url.scheme?.lowercased() else {
return nil
}
guard protocolWhitelist.contains(urlProtocol) else {
return nil
}
guard let domain = url.host?.lowercased() else {
return nil
}
guard url.path.count > 1 else {
// URL must have non-empty path.
return nil
}
for whitelistedDomain in domainWhitelist {
if domain == whitelistedDomain.lowercased() {
return whitelistedDomain
}
if allowSubdomains,
domain.hasSuffix("." + whitelistedDomain.lowercased()) {
return whitelistedDomain
}
}
return nil
return true
}
// MARK: - Serial Queue
@ -812,31 +698,27 @@ public class OWSLinkPreview: MTLModel {
}
}
// Example:
//
// <meta property="og:title" content="Randomness is Random - Numberphile">
// <meta property="og:image" content="https://i.ytimg.com/vi/tP-Ipsat90c/maxresdefault.jpg">
class func parse(linkData: Data) throws -> OWSLinkPreviewContents {
guard let linkText = String(bytes: linkData, encoding: .utf8) else {
owsFailDebug("Could not parse link text.")
throw LinkPreviewError.invalidInput
}
let content = HTMLMetadata.construct(parsing: linkText)
var title: String?
if let rawTitle = NSRegularExpression.parseFirstMatch(pattern: "<meta\\s+property\\s*=\\s*\"og:title\"\\s+[^>]*content\\s*=\\s*\"(.*?)\"\\s*[^>]*/?>",
text: linkText,
options: .dotMatchesLineSeparators) {
if let decodedTitle = decodeHTMLEntities(inString: rawTitle) {
let normalizedTitle = OWSLinkPreview.normalizeTitle(title: decodedTitle)
if normalizedTitle.count > 0 {
title = normalizedTitle
}
let rawTitle = content.ogTitle ?? content.titleTag
if let decodedTitle = decodeHTMLEntities(inString: rawTitle ?? "") {
let normalizedTitle = OWSLinkPreview.normalizeTitle(title: decodedTitle)
if normalizedTitle.count > 0 {
title = normalizedTitle
}
}
Logger.verbose("title: \(String(describing: title))")
guard let rawImageUrlString = NSRegularExpression.parseFirstMatch(pattern: "<meta\\s+property\\s*=\\s*\"og:image\"\\s+[^>]*content\\s*=\\s*\"(.*?)\"[^>]*/?>", text: linkText) else {
guard let rawImageUrlString = content.ogImageUrlString ?? content.faviconUrlString else {
return OWSLinkPreviewContents(title: title)
}
guard let imageUrlString = decodeHTMLEntities(inString: rawImageUrlString)?.ows_stripped() else {

@ -52,4 +52,57 @@ public extension NSRegularExpression {
let substring = String(text[textRange])
return substring
}
/// Returns the first match of this regex over the whole of `searchString` as a `MatchSet`,
/// or nil when there is no match.
@nonobjc
func firstMatchSet(in searchString: String) -> MatchSet? {
    let wholeRange = searchString.completeNSRange
    guard let result = firstMatch(in: searchString, options: [], range: wholeRange) else {
        return nil
    }
    return result.createMatchSet(originalSearchString: searchString)
}
/// Returns every match of this regex over the whole of `searchString` as `MatchSet` values.
/// Matches that cannot be converted into a `MatchSet` are dropped.
@nonobjc
func allMatchSets(in searchString: String) -> [MatchSet] {
    let results = matches(in: searchString, options: [], range: searchString.completeNSRange)
    return results.compactMap { result in
        result.createMatchSet(originalSearchString: searchString)
    }
}
}
/// The result of one regular-expression match: the full matched text plus its capture groups.
public struct MatchSet {
    /// The text covered by the entire match.
    let fullString: Substring
    /// Capture groups in pattern order, so index 0 is the first capture group.
    /// An entry is nil when no substring could be produced for that group.
    let matchedGroups: [Substring?]

    /// Returns the capture group at `idx`.
    ///
    /// - Parameter idx: Zero-based index into `matchedGroups`.
    /// - Returns: The group's text, or nil if `idx` is out of bounds (negative included)
    ///   or the stored entry is nil.
    func group(idx: Int) -> Substring? {
        // `indices.contains` checks both bounds, so a negative index yields nil instead of trapping.
        guard matchedGroups.indices.contains(idx) else { return nil }
        return matchedGroups[idx]
    }
}
fileprivate extension String {
    /// Returns the substring covered by `nsRange`, or nil when the range cannot be
    /// converted into a valid range of this string.
    subscript(_ nsRange: NSRange) -> Substring? {
        if let bounds = Range(nsRange, in: self) {
            return self[bounds]
        }
        return nil
    }

    /// The range spanning the whole string.
    var completeRange: Range<String.Index> {
        return startIndex..<endIndex
    }

    /// The whole string expressed as an `NSRange` (UTF-16 offsets).
    var completeNSRange: NSRange {
        return NSRange(location: 0, length: utf16.count)
    }
}
fileprivate extension NSTextCheckingResult {
    /// Converts this result into a `MatchSet` by slicing `string` with each of the result's ranges.
    ///
    /// - Parameter string: The string that was searched to produce this result.
    /// - Returns: nil when the result has no ranges or the full-match range cannot be sliced
    ///   out of `string`. Otherwise a `MatchSet` whose groups come from ranges `1...`.
    func createMatchSet(originalSearchString string: String) -> MatchSet? {
        guard numberOfRanges > 0 else { return nil }

        // Range 0 is the full match; the rest are capture groups. A group whose range
        // cannot be converted is stored as nil.
        var slices: [Substring?] = []
        for index in 0..<numberOfRanges {
            slices.append(string[range(at: index)])
        }

        guard let fullString = slices[0] else {
            owsFailDebug("Missing expected full string")
            return nil
        }
        return MatchSet(fullString: fullString, matchedGroups: Array(slices.dropFirst()))
    }
}

Loading…
Cancel
Save