Ferrite-backup/Ferrite/ViewModels/ScrapingViewModel.swift
kingbri 44e4f74258 Searching: Cleanup existing searches
If a user searched after cancelling the search the first time,
the first search would still continue.

Assign the search task to navigation view and automatically cancel
it and dismiss the searchbar when the user switches to a different
tab.

Also add a ProgressView to show which source is being parsed.

Signed-off-by: kingbri <bdashore3@gmail.com>
2022-08-09 12:29:18 -04:00

539 lines
20 KiB
Swift

//
// ScrapingViewModel.swift
// Ferrite
//
// Created by Brian Dashore on 7/4/22.
//
import Base32
import Regex
import SwiftSoup
import SwiftUI
public struct SearchResult: Hashable, Codable {
let title: String
let source: String
let size: String
let magnetLink: String
let magnetHash: String?
let seeders: String?
let leechers: String?
}
class ScrapingViewModel: ObservableObject {
@AppStorage("RealDebrid.Enabled") var realDebridEnabled = false
// Link the toast view model for single-directional communication
var toastModel: ToastViewModel?
let byteCountFormatter: ByteCountFormatter = .init()
@Published var runningSearchTask: Task<Void, Error>?
@Published var searchResults: [SearchResult] = []
@Published var searchText: String = ""
@Published var selectedSearchResult: SearchResult?
@Published var filteredSource: Source?
@Published var currentSourceName: String?
@MainActor
public func scanSources(sources: [Source]) async {
if sources.isEmpty {
Task { @MainActor in
toastModel?.toastType = .info
toastModel?.toastDescription = "There are no sources to search!"
}
print("Sources empty")
return
}
var tempResults: [SearchResult] = []
for source in sources {
if source.enabled {
currentSourceName = source.name
// Default to HTML scraping
let preferredParser = SourcePreferredParser(rawValue: source.preferredParser) ?? .none
switch preferredParser {
case .scraping:
if let htmlParser = source.htmlParser {
guard let encodedQuery = searchText.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else {
toastModel?.toastDescription = "Could not process search query, invalid characters present."
print("Could not process search query, invalid characters present")
continue
}
let urlString = source.baseUrl + htmlParser.searchUrl.replacingOccurrences(of: "{query}", with: encodedQuery)
guard let html = await fetchWebsiteData(urlString: urlString) else {
continue
}
let sourceResults = await scrapeHtml(source: source, html: html)
tempResults += sourceResults
}
case .rss:
if let rssParser = source.rssParser {
guard let encodedQuery = searchText.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else {
toastModel?.toastDescription = "Could not process search query, invalid characters present."
print("Could not process search query, invalid characters present")
continue
}
let replacedSearchUrl = rssParser.searchUrl.replacingOccurrences(of: "{query}", with: encodedQuery)
// If there is an RSS base URL, use that instead
var urlString: String
if let rssUrl = rssParser.rssUrl {
urlString = rssUrl + replacedSearchUrl
} else {
urlString = source.baseUrl + replacedSearchUrl
}
guard let rss = await fetchWebsiteData(urlString: urlString) else {
continue
}
let sourceResults = scrapeRss(source: source, rss: rss)
tempResults += sourceResults
}
case .siteApi, .none:
continue
}
}
}
// If the task is cancelled, return
if let searchTask = runningSearchTask, searchTask.isCancelled {
return
}
searchResults = tempResults
}
// Fetches the data for a URL
@MainActor
public func fetchWebsiteData(urlString: String) async -> String? {
guard let url = URL(string: urlString) else {
toastModel?.toastDescription = "Source doesn't contain a valid URL, contact the source dev!"
print("Source doesn't contain a valid URL, contact the source dev!")
return nil
}
do {
let (data, _) = try await URLSession.shared.data(from: url)
let html = String(data: data, encoding: .ascii)
return html
} catch {
let error = error as NSError
switch error.code {
case -999:
toastModel?.toastType = .info
toastModel?.toastDescription = "Search cancelled"
default:
toastModel?.toastDescription = "Error in fetching data \(error)"
}
print("Error in fetching data \(error)")
return nil
}
}
// RSS feed scraper
@MainActor
public func scrapeRss(source: Source, rss: String) -> [SearchResult] {
guard let rssParser = source.rssParser else {
return []
}
var tempResults: [SearchResult] = []
var items = Elements()
do {
let document = try SwiftSoup.parse(rss, "", Parser.xmlParser())
items = try document.getElementsByTag("item")
} catch {
toastModel?.toastDescription = "RSS scraping error, couldn't fetch items: \(error)"
print("RSS scraping error, couldn't fetch items: \(error)")
return []
}
for item in items {
// Parse magnet link or translate hash
var magnetHash: String?
if let magnetHashParser = rssParser.magnetHash {
magnetHash = try? runRssComplexQuery(
item: item,
query: magnetHashParser.query,
attribute: magnetHashParser.attribute,
lookupAttribute: magnetHashParser.lookupAttribute,
regexString: magnetHashParser.regex
)
}
var title: String?
if let titleParser = rssParser.title {
title = try? runRssComplexQuery(
item: item,
query: titleParser.query,
attribute: titleParser.attribute,
lookupAttribute: titleParser.lookupAttribute,
regexString: titleParser.regex
)
}
var link: String?
if let magnetLinkParser = rssParser.magnetLink {
link = try? runRssComplexQuery(
item: item,
query: magnetLinkParser.query,
attribute: magnetLinkParser.attribute,
lookupAttribute: magnetLinkParser.lookupAttribute,
regexString: magnetLinkParser.regex
)
} else if let magnetHash = magnetHash {
link = generateMagnetLink(magnetHash: magnetHash, title: title, trackers: rssParser.trackerArray)
} else {
continue
}
guard let href = link, href.starts(with: "magnet:") else {
continue
}
if magnetHash == nil {
magnetHash = fetchMagnetHash(magnetLink: href)
}
var size: String?
if let sizeParser = rssParser.size {
size = try? runRssComplexQuery(
item: item,
query: sizeParser.query,
attribute: sizeParser.attribute,
lookupAttribute: sizeParser.lookupAttribute,
regexString: sizeParser.regex
)
}
if let sizeString = size, let sizeInt = Int64(sizeString) {
size = byteCountFormatter.string(fromByteCount: sizeInt)
}
var seeders: String?
var leechers: String?
if let seederLeecher = rssParser.seedLeech {
if let seederQuery = seederLeecher.seeders {
seeders = try? runRssComplexQuery(
item: item,
query: seederQuery,
attribute: seederLeecher.attribute,
lookupAttribute: seederLeecher.lookupAttribute,
regexString: seederLeecher.seederRegex
)
}
if let leecherQuery = seederLeecher.leechers {
leechers = try? runRssComplexQuery(
item: item,
query: leecherQuery,
attribute: seederLeecher.attribute,
lookupAttribute: seederLeecher.lookupAttribute,
regexString: seederLeecher.leecherRegex
)
}
}
let result = SearchResult(
title: title ?? "No title",
source: source.name,
size: size ?? "",
magnetLink: href,
magnetHash: magnetHash,
seeders: seeders,
leechers: leechers
)
tempResults.append(result)
}
return tempResults
}
// HTML scraper
@MainActor
public func scrapeHtml(source: Source, html: String) async -> [SearchResult] {
guard let htmlParser = source.htmlParser else {
return []
}
var rows = Elements()
do {
let document = try SwiftSoup.parse(html)
rows = try document.select(htmlParser.rows)
} catch {
toastModel?.toastDescription = "Scraping error, couldn't fetch rows: \(error)"
print("Scraping error, couldn't fetch rows: \(error)")
return []
}
var tempResults: [SearchResult] = []
// If there's an error, continue instead of returning with nothing
for row in rows {
do {
// Fetches the magnet link
// If the magnet is located on an external page, fetch the external page and grab the magnet link
// External page fetching affects source performance
guard let magnetParser = htmlParser.magnetLink else {
continue
}
var href: String
if let externalMagnetQuery = magnetParser.externalLinkQuery, !externalMagnetQuery.isEmpty {
guard let externalMagnetLink = try row.select(externalMagnetQuery).first()?.attr("href") else {
continue
}
guard let magnetHtml = await fetchWebsiteData(urlString: source.baseUrl + externalMagnetLink) else {
continue
}
let magnetDocument = try SwiftSoup.parse(magnetHtml)
guard let linkResult = try magnetDocument.select(magnetParser.query).first() else {
continue
}
if magnetParser.attribute == "text" {
href = try linkResult.text()
} else {
href = try linkResult.attr(magnetParser.attribute)
}
} else {
guard let link = try runHtmlComplexQuery(
row: row,
query: magnetParser.query,
attribute: magnetParser.attribute,
regexString: magnetParser.regex
) else {
continue
}
href = link
}
if !href.starts(with: "magnet:") {
continue
}
// Fetches the magnet hash
let magnetHash = fetchMagnetHash(magnetLink: href)
// Fetches the episode/movie title
var title: String?
if let titleParser = htmlParser.title {
title = try? runHtmlComplexQuery(
row: row,
query: titleParser.query,
attribute: titleParser.attribute,
regexString: titleParser.regex
)
}
// Fetches the torrent's size
// TODO: Add int translation
var size: String?
if let sizeParser = htmlParser.size {
size = try? runHtmlComplexQuery(
row: row,
query: sizeParser.query,
attribute: sizeParser.attribute,
regexString: sizeParser.regex
)
}
// Fetches seeders and leechers if there are any
var seeders: String?
var leechers: String?
if let seederLeecher = htmlParser.seedLeech {
if let combinedQuery = seederLeecher.combined {
if let combinedString = try? runHtmlComplexQuery(
row: row,
query: combinedQuery,
attribute: seederLeecher.attribute,
regexString: nil
) {
if let seederRegex = seederLeecher.seederRegex, let leecherRegex = seederLeecher.leecherRegex {
// Seeder regex matching
seeders = try? Regex(seederRegex).firstMatch(in: combinedString)?.groups[safe: 0]?.value
// Leecher regex matching
leechers = try? Regex(leecherRegex).firstMatch(in: combinedString)?.groups[safe: 0]?.value
}
}
} else {
if let seederQuery = seederLeecher.seeders {
seeders = try? runHtmlComplexQuery(
row: row,
query: seederQuery,
attribute: seederLeecher.attribute,
regexString: seederLeecher.seederRegex
)
}
if let leecherQuery = seederLeecher.seeders {
leechers = try? runHtmlComplexQuery(
row: row,
query: leecherQuery,
attribute: seederLeecher.attribute,
regexString: seederLeecher.leecherRegex
)
}
}
}
let result = SearchResult(
title: title ?? "No title",
source: source.name,
size: size ?? "",
magnetLink: href,
magnetHash: magnetHash,
seeders: seeders,
leechers: leechers
)
tempResults.append(result)
} catch {
toastModel?.toastDescription = "Scraping error: \(error)"
print("Scraping error: \(error)")
continue
}
}
return tempResults
}
// Complex query parsing for HTML scraping
func runHtmlComplexQuery(row: Element, query: String, attribute: String, regexString: String?) throws -> String? {
var parsedValue: String?
let result = try row.select(query).first()
switch attribute {
case "text":
parsedValue = try result?.text()
default:
parsedValue = try result?.attr(attribute)
}
// A capture group must be used in the provided regex
if let regexString = regexString,
let parsedValue = parsedValue,
let regexValue = try? Regex(regexString).firstMatch(in: parsedValue)?.groups[safe: 0]?.value
{
return regexValue
} else {
return parsedValue
}
}
// Complex query parsing for RSS scraping
func runRssComplexQuery(item: Element, query: String, attribute: String, lookupAttribute: String?, regexString: String?) throws -> String? {
var parsedValue: String?
switch attribute {
case "text":
parsedValue = try item.getElementsByTag(query).first()?.text()
default:
// If there's a key/value to lookup the attribute with, query it. Othewise assume the value is in the same attribute
if let lookupAttribute = lookupAttribute {
let containerElement = try item.getElementsByAttributeValue(lookupAttribute, query).first()
parsedValue = try containerElement?.attr(attribute)
} else {
let containerElement = try item.getElementsByAttribute(attribute).first()
parsedValue = try containerElement?.attr(attribute)
}
}
// A capture group must be used in the provided regex
if let regexString = regexString,
let parsedValue = parsedValue,
let regexValue = try? Regex(regexString).firstMatch(in: parsedValue)?.groups[safe: 0]?.value
{
return regexValue
} else {
return parsedValue
}
}
// Fetches and possibly converts the magnet hash value to sha1
public func fetchMagnetHash(magnetLink: String) -> String? {
guard let firstSplit = magnetLink.split(separator: ":")[safe: 3] else {
return nil
}
guard let magnetHash = firstSplit.split(separator: "&")[safe: 0] else {
return nil
}
// Is this a Base32hex hash?
if magnetHash.count == 32 {
let decryptedMagnetHash = base32DecodeToData(String(magnetHash))
return decryptedMagnetHash?.hexEncodedString()
} else {
return String(magnetHash).lowercased()
}
}
func parseSizeString(sizeString: String) -> String? {
// Test if the string can be a full integer
guard let size = Int(sizeString) else {
return nil
}
let length = sizeString.count
if length > 9 {
// This is a GB
return String("\(Double(size) / 1e9) GB")
} else if length > 6 {
// This is a MB
return String("\(Double(size) / 1e6) MB")
} else if length > 3 {
// This is a KB
return String("\(Double(size) / 1e3) KB")
} else {
return nil
}
}
public func generateMagnetLink(magnetHash: String, title: String?, trackers: [SourceTracker]) -> String {
var magnetLinkArray = ["magnet:?xt=urn:btih:"]
magnetLinkArray.append(magnetHash)
if let title = title, let encodedTitle = title.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) {
magnetLinkArray.append("&dn=\(encodedTitle)")
}
for tracker in trackers {
if URL(string: tracker.urlString) != nil,
let encodedUrlString = tracker.urlString.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed)
{
magnetLinkArray.append("&tr=\(encodedUrlString)")
}
}
return magnetLinkArray.joined()
}
}