Ferrite-backup/Ferrite/ViewModels/ScrapingViewModel.swift
kingbri 064a68fbb2 Sources: Add RSS, descriptions, and settings
RSS parsing has been added as a method to parse source since they're
easier on the website's end to parse.

Source settings have been added. The only current setting is the fetch
mode which selects which parser/scraper to use. By default, if an RSS
parser is found, it's selected.

A source now has info shown regarding versioning and authorship. A source
list's repository name and author string are now required.

Signed-off-by: kingbri <bdashore3@gmail.com>
2022-08-31 00:41:38 -04:00

517 lines
19 KiB
Swift

//
// ScrapingViewModel.swift
// Ferrite
//
// Created by Brian Dashore on 7/4/22.
//
import Base32
import Regex
import SwiftSoup
import SwiftUI
public struct SearchResult: Hashable, Codable {
let title: String
let source: String
let size: String
let magnetLink: String
let magnetHash: String?
let seeders: String?
let leechers: String?
}
class ScrapingViewModel: ObservableObject {
@AppStorage("RealDebrid.Enabled") var realDebridEnabled = false
// Link the toast view model for single-directional communication
var toastModel: ToastViewModel?
let byteCountFormatter: ByteCountFormatter = .init()
@Published var searchResults: [SearchResult] = []
@Published var searchText: String = ""
@Published var selectedSearchResult: SearchResult?
@Published var filteredSource: Source?
@MainActor
public func scanSources(sources: [Source]) async {
if sources.isEmpty {
print("Sources empty")
return
}
var tempResults: [SearchResult] = []
for source in sources {
if source.enabled {
// Default to HTML scraping
let preferredParser = SourcePreferredParser(rawValue: source.preferredParser) ?? .none
switch preferredParser {
case .scraping:
if let htmlParser = source.htmlParser {
guard let encodedQuery = searchText.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else {
toastModel?.toastDescription = "Could not process search query, invalid characters present."
print("Could not process search query, invalid characters present")
continue
}
let urlString = source.baseUrl + htmlParser.searchUrl.replacingOccurrences(of: "{query}", with: encodedQuery)
guard let html = await fetchWebsiteData(urlString: urlString) else {
continue
}
let sourceResults = await scrapeHtml(source: source, html: html)
tempResults += sourceResults
}
case .rss:
if let rssParser = source.rssParser {
guard let encodedQuery = searchText.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else {
toastModel?.toastDescription = "Could not process search query, invalid characters present."
print("Could not process search query, invalid characters present")
continue
}
let replacedSearchUrl = rssParser.searchUrl.replacingOccurrences(of: "{query}", with: encodedQuery)
// If there is an RSS base URL, use that instead
var urlString: String
if let rssUrl = rssParser.rssUrl {
urlString = rssUrl + replacedSearchUrl
} else {
urlString = source.baseUrl + replacedSearchUrl
}
guard let rss = await fetchWebsiteData(urlString: urlString) else {
continue
}
let sourceResults = scrapeRss(source: source, rss: rss)
tempResults += sourceResults
}
case .siteApi, .none:
continue
}
}
}
searchResults = tempResults
}
// Fetches the data for a URL
@MainActor
public func fetchWebsiteData(urlString: String) async -> String? {
guard let url = URL(string: urlString) else {
toastModel?.toastDescription = "Source doesn't contain a valid URL, contact the source dev!"
print("Source doesn't contain a valid URL, contact the source dev!")
return nil
}
do {
let (data, _) = try await URLSession.shared.data(from: url)
let html = String(data: data, encoding: .ascii)
return html
} catch {
toastModel?.toastDescription = "Error in fetching data \(error)"
print("Error in fetching data \(error)")
return nil
}
}
// RSS feed scraper
@MainActor
public func scrapeRss(source: Source, rss: String) -> [SearchResult] {
guard let rssParser = source.rssParser else {
return []
}
var tempResults: [SearchResult] = []
var items = Elements()
do {
let document = try SwiftSoup.parse(rss, "", Parser.xmlParser())
items = try document.getElementsByTag("item")
} catch {
toastModel?.toastDescription = "RSS scraping error, couldn't fetch items: \(error)"
print("RSS scraping error, couldn't fetch items: \(error)")
return []
}
for item in items {
// Parse magnet link or translate hash
var magnetHash: String?
if let magnetHashParser = rssParser.magnetHash {
magnetHash = try? runRssComplexQuery(
item: item,
query: magnetHashParser.query,
attribute: magnetHashParser.attribute,
lookupAttribute: magnetHashParser.lookupAttribute,
regexString: magnetHashParser.regex
)
}
var title: String?
if let titleParser = rssParser.title {
title = try? runRssComplexQuery(
item: item,
query: titleParser.query,
attribute: titleParser.attribute,
lookupAttribute: titleParser.lookupAttribute,
regexString: titleParser.regex
)
}
var link: String?
if let magnetLinkParser = rssParser.magnetLink {
link = try? runRssComplexQuery(
item: item,
query: magnetLinkParser.query,
attribute: magnetLinkParser.attribute,
lookupAttribute: magnetLinkParser.lookupAttribute,
regexString: magnetLinkParser.regex
)
} else if let magnetHash = magnetHash {
link = generateMagnetLink(magnetHash: magnetHash, title: title, trackers: rssParser.trackerArray)
} else {
continue
}
guard let href = link, href.starts(with: "magnet:") else {
continue
}
if magnetHash == nil {
magnetHash = fetchMagnetHash(magnetLink: href)
}
var size: String?
if let sizeParser = rssParser.size {
size = try? runRssComplexQuery(
item: item,
query: sizeParser.query,
attribute: sizeParser.attribute,
lookupAttribute: sizeParser.lookupAttribute,
regexString: sizeParser.regex
)
}
if let sizeString = size, let sizeInt = Int64(sizeString) {
size = byteCountFormatter.string(fromByteCount: sizeInt)
}
var seeders: String?
var leechers: String?
if let seederLeecher = rssParser.seedLeech {
if let seederQuery = seederLeecher.seeders {
seeders = try? runRssComplexQuery(
item: item,
query: seederQuery,
attribute: seederLeecher.attribute,
lookupAttribute: seederLeecher.lookupAttribute,
regexString: seederLeecher.seederRegex
)
}
if let leecherQuery = seederLeecher.leechers {
leechers = try? runRssComplexQuery(
item: item,
query: leecherQuery,
attribute: seederLeecher.attribute,
lookupAttribute: seederLeecher.lookupAttribute,
regexString: seederLeecher.leecherRegex
)
}
}
let result = SearchResult(
title: title ?? "No title",
source: source.name,
size: size ?? "",
magnetLink: href,
magnetHash: magnetHash,
seeders: seeders,
leechers: leechers
)
tempResults.append(result)
}
return tempResults
}
// HTML scraper
@MainActor
public func scrapeHtml(source: Source, html: String) async -> [SearchResult] {
guard let htmlParser = source.htmlParser else {
return []
}
var rows = Elements()
do {
let document = try SwiftSoup.parse(html)
rows = try document.select(htmlParser.rows)
} catch {
toastModel?.toastDescription = "Scraping error, couldn't fetch rows: \(error)"
print("Scraping error, couldn't fetch rows: \(error)")
return []
}
var tempResults: [SearchResult] = []
// If there's an error, continue instead of returning with nothing
for row in rows {
do {
// Fetches the magnet link
// If the magnet is located on an external page, fetch the external page and grab the magnet link
// External page fetching affects source performance
guard let magnetParser = htmlParser.magnetLink else {
continue
}
var href: String
if let externalMagnetQuery = magnetParser.externalLinkQuery, !externalMagnetQuery.isEmpty {
guard let externalMagnetLink = try row.select(externalMagnetQuery).first()?.attr("href") else {
continue
}
guard let magnetHtml = await fetchWebsiteData(urlString: source.baseUrl + externalMagnetLink) else {
continue
}
let magnetDocument = try SwiftSoup.parse(magnetHtml)
guard let linkResult = try magnetDocument.select(magnetParser.query).first() else {
continue
}
if magnetParser.attribute == "text" {
href = try linkResult.text()
} else {
href = try linkResult.attr(magnetParser.attribute)
}
} else {
guard let link = try runHtmlComplexQuery(
row: row,
query: magnetParser.query,
attribute: magnetParser.attribute,
regexString: magnetParser.regex
) else {
continue
}
href = link
}
if !href.starts(with: "magnet:") {
continue
}
// Fetches the magnet hash
let magnetHash = fetchMagnetHash(magnetLink: href)
// Fetches the episode/movie title
var title: String?
if let titleParser = htmlParser.title {
title = try? runHtmlComplexQuery(
row: row,
query: titleParser.query,
attribute: titleParser.attribute,
regexString: titleParser.regex
)
}
// Fetches the torrent's size
// TODO: Add int translation
var size: String?
if let sizeParser = htmlParser.size {
size = try? runHtmlComplexQuery(
row: row,
query: sizeParser.query,
attribute: sizeParser.attribute,
regexString: sizeParser.regex
)
}
// Fetches seeders and leechers if there are any
var seeders: String?
var leechers: String?
if let seederLeecher = htmlParser.seedLeech {
if let combinedQuery = seederLeecher.combined {
if let combinedString = try? runHtmlComplexQuery(
row: row,
query: combinedQuery,
attribute: seederLeecher.attribute,
regexString: nil
) {
if let seederRegex = seederLeecher.seederRegex, let leecherRegex = seederLeecher.leecherRegex {
// Seeder regex matching
seeders = try? Regex(seederRegex).firstMatch(in: combinedString)?.groups[safe: 0]?.value
// Leecher regex matching
leechers = try? Regex(leecherRegex).firstMatch(in: combinedString)?.groups[safe: 0]?.value
}
}
} else {
if let seederQuery = seederLeecher.seeders {
seeders = try? runHtmlComplexQuery(
row: row,
query: seederQuery,
attribute: seederLeecher.attribute,
regexString: seederLeecher.seederRegex
)
}
if let leecherQuery = seederLeecher.seeders {
leechers = try? runHtmlComplexQuery(
row: row,
query: leecherQuery,
attribute: seederLeecher.attribute,
regexString: seederLeecher.leecherRegex
)
}
}
}
let result = SearchResult(
title: title ?? "No title",
source: source.name,
size: size ?? "",
magnetLink: href,
magnetHash: magnetHash,
seeders: seeders,
leechers: leechers
)
tempResults.append(result)
} catch {
toastModel?.toastDescription = "Scraping error: \(error)"
print("Scraping error: \(error)")
continue
}
}
return tempResults
}
// Complex query parsing for HTML scraping
func runHtmlComplexQuery(row: Element, query: String, attribute: String, regexString: String?) throws -> String? {
var parsedValue: String?
let result = try row.select(query).first()
switch attribute {
case "text":
parsedValue = try result?.text()
default:
parsedValue = try result?.attr(attribute)
}
// A capture group must be used in the provided regex
if let regexString = regexString,
let parsedValue = parsedValue,
let regexValue = try? Regex(regexString).firstMatch(in: parsedValue)?.groups[safe: 0]?.value
{
return regexValue
} else {
return parsedValue
}
}
// Complex query parsing for RSS scraping
func runRssComplexQuery(item: Element, query: String, attribute: String, lookupAttribute: String?, regexString: String?) throws -> String? {
var parsedValue: String?
switch attribute {
case "text":
parsedValue = try item.getElementsByTag(query).first()?.text()
default:
// If there's a key/value to lookup the attribute with, query it. Othewise assume the value is in the same attribute
if let lookupAttribute = lookupAttribute {
let containerElement = try item.getElementsByAttributeValue(lookupAttribute, query).first()
parsedValue = try containerElement?.attr(attribute)
} else {
let containerElement = try item.getElementsByAttribute(attribute).first()
parsedValue = try containerElement?.attr(attribute)
}
}
// A capture group must be used in the provided regex
if let regexString = regexString,
let parsedValue = parsedValue,
let regexValue = try? Regex(regexString).firstMatch(in: parsedValue)?.groups[safe: 0]?.value
{
return regexValue
} else {
return parsedValue
}
}
// Fetches and possibly converts the magnet hash value to sha1
public func fetchMagnetHash(magnetLink: String) -> String? {
guard let firstSplit = magnetLink.split(separator: ":")[safe: 3] else {
return nil
}
guard let magnetHash = firstSplit.split(separator: "&")[safe: 0] else {
return nil
}
// Is this a Base32hex hash?
if magnetHash.count == 32 {
let decryptedMagnetHash = base32DecodeToData(String(magnetHash))
return decryptedMagnetHash?.hexEncodedString()
} else {
return String(magnetHash).lowercased()
}
}
func parseSizeString(sizeString: String) -> String? {
// Test if the string can be a full integer
guard let size = Int(sizeString) else {
return nil
}
let length = sizeString.count
if length > 9 {
// This is a GB
return String("\(Double(size) / 1e9) GB")
} else if length > 6 {
// This is a MB
return String("\(Double(size) / 1e6) MB")
} else if length > 3 {
// This is a KB
return String("\(Double(size) / 1e3) KB")
} else {
return nil
}
}
public func generateMagnetLink(magnetHash: String, title: String?, trackers: [SourceTracker]) -> String {
var magnetLinkArray: [String] = ["magnet:?xt=urn:btih:"]
magnetLinkArray.append(magnetHash)
if let title = title, let encodedTitle = title.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) {
magnetLinkArray.append("&dn=\(encodedTitle)")
}
for tracker in trackers {
if URL(string: tracker.urlString) != nil,
let encodedUrlString = tracker.urlString.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed)
{
magnetLinkArray.append("&tr=\(encodedUrlString)")
}
}
return magnetLinkArray.joined()
}
}