Ferrite-backup/Ferrite/ViewModels/ScrapingViewModel.swift
kingbri 102b59ab0a Scraping: Add source specific errors
State what source errored when one occurs.

Signed-off-by: kingbri <bdashore3@proton.me>
2022-11-19 12:13:29 -05:00

909 lines
34 KiB
Swift

//
// ScrapingViewModel.swift
// Ferrite
//
// Created by Brian Dashore on 7/4/22.
//
import Base32
import Regex
import SwiftSoup
import SwiftUI
import SwiftyJSON
class ScrapingViewModel: ObservableObject {
/// Flag persisted under the "RealDebrid.Enabled" key; defaults to `false`.
@AppStorage("RealDebrid.Enabled") var realDebridEnabled = false
// Link the toast view model for single-directional communication
var toastModel: ToastViewModel?
/// Formatter used to turn raw byte counts into display strings for result sizes.
let byteCountFormatter: ByteCountFormatter = .init()
/// Handle to the search task; `scanSources` checks its cancellation state before publishing results.
var runningSearchTask: Task<Void, Error>?
/// Results published to observers; replaced wholesale by `updateSearchResults`.
@Published var searchResults: [SearchResult] = []
/// Query text that `scanSources` percent-encodes and substitutes into each source's search URL.
@Published var searchText: String = ""
/// NOTE(review): not read anywhere in this class — presumably a UI-side filter; confirm against the views.
@Published var filteredSource: Source?
/// Name of the source currently being scanned; `sendSourceError` prefixes it onto error messages.
@Published var currentSourceName: String?
/// Replaces the published search results with `newResults`.
///
/// Isolated to the main actor, so the `@Published` property is only mutated there.
@MainActor
func updateSearchResults(newResults: [SearchResult]) {
    self.searchResults = newResults
}
/// Reports an error on behalf of the source that is currently being scanned.
///
/// The message is prefixed with `currentSourceName` (or "No source given" when none is set),
/// forwarded to the toast model if one is linked, and then printed to the console.
///
/// - Parameters:
///   - description: The error text to show after the source name.
///   - newToastType: Optional toast type passed through to the toast model unchanged.
func sendSourceError(_ description: String, newToastType: ToastViewModel.ToastType? = nil) async {
    let sourceLabel = currentSourceName ?? "No source given"
    let message = "\(sourceLabel): \(description)"

    if let toastModel {
        await toastModel.updateToastDescription(message, newToastType: newToastType)
    }

    print(message)
}
/// Searches every enabled source for `searchText` and publishes the combined results.
///
/// Each enabled source is queried according to its preferred parser (HTML scraping, RSS, or a JSON
/// site API). Results from all sources are accumulated and handed to `updateSearchResults` once at
/// the end, unless `runningSearchTask` has been cancelled in the meantime.
///
/// - Parameter sources: The sources to scan. Disabled sources are skipped. An empty array shows an
///   informational toast and returns immediately.
public func scanSources(sources: [Source]) async {
    if sources.isEmpty {
        await toastModel?.updateToastDescription("There are no sources to search!", newToastType: .info)
        print("There are no sources to search!")
        return
    }

    var tempResults: [SearchResult] = []

    for source in sources where source.enabled {
        // Results are discarded after a cancellation anyway, so don't keep hitting the remaining sources
        if let searchTask = runningSearchTask, searchTask.isCancelled {
            return
        }

        // Await the update so that any error reported below is attributed to THIS source.
        // A fire-and-forget task could still be pending when sendSourceError reads the name.
        let sourceName = source.name
        await MainActor.run {
            self.currentSourceName = sourceName
        }

        guard let baseUrl = source.baseUrl else {
            await toastModel?.updateToastDescription("The base URL could not be found for source \(source.name)")
            print("The base URL could not be found for source \(source.name)")
            continue
        }

        // An unknown parser value falls back to .none, which skips the source
        let preferredParser = SourcePreferredParser(rawValue: source.preferredParser) ?? .none

        guard let encodedQuery = searchText.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else {
            await sendSourceError("Could not process search query, invalid characters present.")
            continue
        }

        switch preferredParser {
        case .scraping:
            if let htmlParser = source.htmlParser {
                let replacedSearchUrl = htmlParser.searchUrl
                    .replacingOccurrences(of: "{query}", with: encodedQuery)

                let data = await handleUrls(
                    baseUrl: baseUrl,
                    replacedSearchUrl: replacedSearchUrl,
                    fallbackUrls: source.fallbackUrls
                )

                if let data,
                   let html = String(data: data, encoding: .utf8)
                {
                    let sourceResults = await scrapeHtml(source: source, baseUrl: baseUrl, html: html)
                    tempResults += sourceResults
                }
            }
        case .rss:
            if let rssParser = source.rssParser {
                let replacedSearchUrl = rssParser.searchUrl
                    .replacingOccurrences(of: "{secret}", with: source.api?.clientSecret?.value ?? "")
                    .replacingOccurrences(of: "{query}", with: encodedQuery)

                // Do not use fallback URLs if the base URL isn't used
                let data: Data?
                if let rssUrl = rssParser.rssUrl {
                    data = await fetchWebsiteData(urlString: rssUrl + replacedSearchUrl)
                } else {
                    data = await handleUrls(
                        baseUrl: baseUrl,
                        replacedSearchUrl: replacedSearchUrl,
                        fallbackUrls: source.fallbackUrls
                    )
                }

                if let data,
                   let rss = String(data: data, encoding: .utf8)
                {
                    let sourceResults = await scrapeRss(source: source, rss: rss)
                    tempResults += sourceResults
                }
            }
        case .siteApi:
            if let jsonParser = source.jsonParser {
                var replacedSearchUrl = jsonParser.searchUrl
                    .replacingOccurrences(of: "{query}", with: encodedQuery)

                // Handle anything API related including tokens, client IDs, and appending the API URL
                // The source API key is for APIs that require extra credentials or use a different URL
                if let sourceApi = source.api {
                    if let clientIdInfo = sourceApi.clientId {
                        if let newSearchUrl = await handleApiCredential(clientIdInfo,
                                                                        replacement: "{clientId}",
                                                                        searchUrl: replacedSearchUrl,
                                                                        apiUrl: sourceApi.apiUrl,
                                                                        baseUrl: baseUrl)
                        {
                            replacedSearchUrl = newSearchUrl
                        }
                    }

                    // Works exactly the same as the client ID check
                    if let clientSecretInfo = sourceApi.clientSecret {
                        if let newSearchUrl = await handleApiCredential(clientSecretInfo,
                                                                        replacement: "{secret}",
                                                                        searchUrl: replacedSearchUrl,
                                                                        apiUrl: sourceApi.apiUrl,
                                                                        baseUrl: baseUrl)
                        {
                            replacedSearchUrl = newSearchUrl
                        }
                    }
                }

                // Prefer the dedicated API URL over the base URL when the source declares one
                let passedUrl = source.api?.apiUrl ?? baseUrl
                let data = await handleUrls(
                    baseUrl: passedUrl,
                    replacedSearchUrl: replacedSearchUrl,
                    fallbackUrls: source.fallbackUrls
                )

                if let data {
                    let sourceResults = await scrapeJson(source: source, jsonData: data)
                    tempResults += sourceResults
                }
            }
        case .none:
            continue
        }
    }

    // If the task is cancelled, return without publishing partial results
    if let searchTask = runningSearchTask, searchTask.isCancelled {
        return
    }

    await updateSearchResults(newResults: tempResults)
}
/// Fetches website data from the base URL, falling back to each fallback URL in order.
///
/// - Parameters:
///   - baseUrl: Tried first.
///   - replacedSearchUrl: Search path appended to whichever URL is being tried.
///   - fallbackUrls: Optional alternatives, tried in the given order if the base URL yields no data.
/// - Returns: The data from the first URL that responds, or `nil` when none do.
func handleUrls(baseUrl: String, replacedSearchUrl: String, fallbackUrls: [String]?) async -> Data? {
    // The base URL always goes first, followed by any fallbacks
    let candidateUrls = [baseUrl] + (fallbackUrls ?? [])

    for candidateUrl in candidateUrls {
        guard let data = await fetchWebsiteData(urlString: candidateUrl + replacedSearchUrl) else {
            continue
        }

        return data
    }

    return nil
}
/// Substitutes an API credential into a search URL, fetching a fresh credential when needed.
///
/// A stored value is used as-is unless it has expired. A credential counts as expired when it has
/// both a timestamp and a non-zero expiry length, and that many seconds have passed since the
/// timestamp. Otherwise a new value is requested from the credential's own URL, stored on the
/// credential together with the current date, and saved.
///
/// - Parameters:
///   - credential: The credential to read or refresh.
///   - replacement: Placeholder in `searchUrl` to replace (for example `{clientId}`).
///   - searchUrl: The search URL containing the placeholder.
///   - apiUrl: Host used for the credential request; `baseUrl` is used when this is `nil`.
///   - baseUrl: Fallback host for the credential request.
/// - Returns: `searchUrl` with the placeholder replaced, or `nil` if no usable credential exists.
public func handleApiCredential(_ credential: SourceApiCredential,
                                replacement: String,
                                searchUrl: String,
                                apiUrl: String?,
                                baseUrl: String) async -> String?
{
    // Is the credential expired
    let isExpired: Bool
    if let issuedAt = credential.timeStamp?.timeIntervalSince1970, credential.expiryLength != 0 {
        isExpired = Date().timeIntervalSince1970 > issuedAt + credential.expiryLength
    } else {
        isExpired = false
    }

    // A stored, unexpired value can be used directly
    if !isExpired, let storedValue = credential.value {
        return searchUrl.replacingOccurrences(of: replacement, with: storedValue)
    }

    // Otherwise fetch a new credential, which needs a URL to fetch it from
    guard let credentialUrl = credential.urlString else {
        return nil
    }

    guard let freshValue = await fetchApiCredential(
        urlString: (apiUrl ?? baseUrl) + credentialUrl,
        credential: credential
    ) else {
        return nil
    }

    let backgroundContext = PersistenceController.shared.backgroundContext

    credential.value = freshValue
    credential.timeStamp = Date()

    PersistenceController.shared.save(backgroundContext)

    return searchUrl.replacingOccurrences(of: replacement, with: freshValue)
}
/// Downloads an API credential from `urlString` and extracts its value.
///
/// The response is interpreted according to `credential.responseType` (JSON when unset or unknown):
/// - JSON: the value at the dot-separated `credential.query` path, if it is a string.
/// - Text: the whole body decoded as UTF-8.
///
/// - Parameters:
///   - urlString: Full URL of the credential endpoint.
///   - credential: Describes how the response should be parsed.
/// - Returns: The credential value, or `nil` when the URL is invalid, the request fails, or the
///   value cannot be extracted.
public func fetchApiCredential(urlString: String, credential: SourceApiCredential) async -> String? {
    guard let url = URL(string: urlString) else {
        // Awaited like every other toast update in this class instead of spawning a detached task
        await toastModel?.updateToastDescription("This token URL is invalid.")
        print("Token url \(urlString) is invalid!")

        return nil
    }

    do {
        let (data, _) = try await URLSession.shared.data(from: url)
        let responseType = ApiCredentialResponseType(rawValue: credential.responseType ?? "") ?? .json

        switch responseType {
        case .json:
            // Without a query there is no way to locate the value inside the JSON body
            guard let credentialQuery = credential.query else {
                return nil
            }

            let json = try JSON(data: data)

            return json[credentialQuery.components(separatedBy: ".")].string
        case .text:
            return String(data: data, encoding: .utf8)
        }
    } catch {
        let error = error as NSError

        switch error.code {
        case NSURLErrorCancelled:
            await toastModel?.updateToastDescription("Search cancelled", newToastType: .info)
        case NSURLErrorTimedOut:
            await sendSourceError("Credentials request timed out")
        default:
            await sendSourceError("Error in fetching an API credential \(error)")
        }

        return nil
    }
}
/// Fetches the raw response body for a URL using a 15 second timeout.
///
/// Failures are reported to the user rather than thrown: a cancelled request shows an informational
/// toast, while a timeout or any other error is reported through `sendSourceError`.
///
/// - Parameter urlString: The URL to request.
/// - Returns: The response data, or `nil` if the URL is invalid or the request fails.
public func fetchWebsiteData(urlString: String) async -> Data? {
    guard let url = URL(string: urlString) else {
        await sendSourceError("Source doesn't contain a valid URL, contact the source dev!")

        return nil
    }

    do {
        let request = URLRequest(url: url, timeoutInterval: 15)

        return try await URLSession.shared.data(for: request).0
    } catch {
        // Bridge to NSError so the numeric URL loading error code can be inspected
        let nsError = error as NSError

        if nsError.code == NSURLErrorCancelled {
            await toastModel?.updateToastDescription("Search cancelled", newToastType: .info)
        } else if nsError.code == NSURLErrorTimedOut {
            await sendSourceError("Data request timed out. Trying fallback URLs if present.")
        } else {
            await sendSourceError("Error in fetching website data \(nsError)")
        }

        return nil
    }
}
/// Parses a JSON API response into search results.
///
/// The result list is read from the dot-separated `jsonParser.results` path, or from the JSON root
/// when no path is configured. Each entry is converted with `parseJsonResult`. When a `subResults`
/// path is configured, only the sub-entries (merged with their parent entry) are emitted. Entries
/// are kept only if they carry a link starting with "magnet:" and are not already in the list.
///
/// - Parameters:
///   - source: The source being scraped. Its JSON parser and optional API credentials are used.
///   - jsonData: Raw response body.
/// - Returns: The parsed results. Empty when the source has no JSON parser, parsing fails, or
///   nothing usable is found.
public func scrapeJson(source: Source, jsonData: Data) async -> [SearchResult] {
    var tempResults: [SearchResult] = []

    guard let jsonParser = source.jsonParser else {
        return tempResults
    }

    let jsonResults: [JSON]

    do {
        let json = try JSON(data: jsonData)

        if let resultsQuery = jsonParser.results {
            jsonResults = json[resultsQuery.components(separatedBy: ".")].arrayValue
        } else {
            jsonResults = json.arrayValue
        }
    } catch {
        // Handle the failure exactly once and stop; falling through to the empty-results check
        // would clean the credentials (and notify the user) a second time
        if let api = source.api {
            await cleanApiCreds(api: api)
            print("JSON parsing error, couldn't fetch results: \(error)")
        } else {
            await sendSourceError("JSON parsing error, couldn't fetch results: \(error)")
        }

        return tempResults
    }

    // If there are no results and the client secret isn't dynamic, just clear out the token
    if let api = source.api, jsonResults.isEmpty {
        await cleanApiCreds(api: api)
        print("JSON results were empty!")
    }

    // Iterate through results and grab what we can
    for result in jsonResults {
        let searchResult = parseJsonResult(result, jsonParser: jsonParser, source: source)

        // If subresults exist, iterate through those as well with the existing result
        // Otherwise append the applied result if it exists
        // Better to be redundant with checks rather than another for loop or filter
        if let subResultsQuery = jsonParser.subResults {
            let subResults = result[subResultsQuery.components(separatedBy: ".")].arrayValue

            for subResult in subResults {
                if let newSearchResult =
                    parseJsonResult(
                        subResult,
                        jsonParser: jsonParser,
                        source: source,
                        existingSearchResult: searchResult
                    ),
                    let magnetLink = newSearchResult.magnetLink,
                    magnetLink.starts(with: "magnet:"),
                    !tempResults.contains(newSearchResult)
                {
                    tempResults.append(newSearchResult)
                }
            }
        } else if
            let searchResult,
            let magnetLink = searchResult.magnetLink,
            magnetLink.starts(with: "magnet:"),
            !tempResults.contains(searchResult)
        {
            tempResults.append(searchResult)
        }
    }

    return tempResults
}
/// Builds a `SearchResult` from a single JSON entry.
///
/// Every configured query is a dot-separated key path into `result`. When `existingSearchResult`
/// is given (the parent of a sub-result), its fields are the starting values and only the missing
/// ones are read from `result`. The exceptions are the magnet hash, which is re-read whenever a
/// hash parser is configured, and the title, which gets the discriminator value prepended when a
/// title discriminator is configured.
///
/// - Parameters:
///   - result: The JSON entry to read from.
///   - jsonParser: Parser configuration holding the queries.
///   - source: The source being scraped. Supplies the result's source name and the trackers.
///   - existingSearchResult: Optional parent result whose values are inherited.
/// - Returns: The assembled result. Any of its fields may be `nil`.
public func parseJsonResult(_ result: JSON, jsonParser: SourceJsonParser, source: Source, existingSearchResult: SearchResult? = nil) -> SearchResult? {
    // Resolves a dot-separated key path against `result`; a JSON null yields nil
    func stringValue(at query: String) -> String? {
        let rawValue = result[query.components(separatedBy: ".")].rawValue

        return rawValue is NSNull ? nil : String(describing: rawValue)
    }

    var magnetHash: String? = existingSearchResult?.magnetHash
    if let magnetHashParser = jsonParser.magnetHash,
       let rawHash = stringValue(at: magnetHashParser.query)
    {
        magnetHash = fetchMagnetHash(existingHash: rawHash)
    }

    var title: String? = existingSearchResult?.title
    if let titleParser = jsonParser.title {
        if let existingTitle = existingSearchResult?.title,
           let discriminatorQuery = titleParser.discriminator
        {
            // Sub-results reuse the parent title, prefixed by whatever tells them apart
            if let discriminator = stringValue(at: discriminatorQuery) {
                title = discriminator + existingTitle
            }
        } else if existingSearchResult?.title == nil {
            // Split on "." like every other query so nested title paths resolve as well
            title = stringValue(at: titleParser.query)
        }
    }

    var link: String? = existingSearchResult?.magnetLink
    if let magnetLinkParser = jsonParser.magnetLink, existingSearchResult?.magnetLink == nil {
        link = stringValue(at: magnetLinkParser.query)
    } else if let magnetHash {
        link = generateMagnetLink(magnetHash: magnetHash, title: title, trackers: source.trackers)
    }

    // No explicit hash was found, so try to recover it from the link
    if magnetHash == nil, let href = link {
        magnetHash = fetchMagnetHash(magnetLink: href)
    }

    var size: String? = existingSearchResult?.size
    if let sizeParser = jsonParser.size, existingSearchResult?.size == nil {
        size = stringValue(at: sizeParser.query)
    }

    // A purely numeric size is taken as a byte count and formatted for display
    if let sizeString = size, let sizeInt = Int64(sizeString) {
        size = byteCountFormatter.string(fromByteCount: sizeInt)
    }

    var seeders: String? = existingSearchResult?.seeders
    var leechers: String? = existingSearchResult?.leechers
    if let seederLeecher = jsonParser.seedLeech {
        if let seederQuery = seederLeecher.seeders, existingSearchResult?.seeders == nil {
            seeders = stringValue(at: seederQuery)
        }

        if let leecherQuery = seederLeecher.leechers, existingSearchResult?.leechers == nil {
            leechers = stringValue(at: leecherQuery)
        }
    }

    return SearchResult(
        title: title,
        source: source.name,
        size: size,
        magnetLink: link,
        magnetHash: magnetHash,
        seeders: seeders,
        leechers: leechers
    )
}
/// Scrapes an RSS feed into search results.
///
/// The feed is parsed as XML and every element matching `rssParser.items` is treated as one entry.
/// An entry is kept only if it yields a link starting with "magnet:", either scraped directly or
/// generated from a scraped hash, and an equal result is not already in the list.
///
/// - Parameters:
///   - source: The source being scraped. Supplies the RSS parser, the name and the trackers.
///   - rss: The feed contents.
/// - Returns: The parsed results. Empty when the source has no RSS parser or the feed can't be parsed.
public func scrapeRss(source: Source, rss: String) async -> [SearchResult] {
    var tempResults: [SearchResult] = []

    guard let rssParser = source.rssParser else {
        return tempResults
    }

    let items: Elements
    do {
        items = try SwiftSoup.parse(rss, "", Parser.xmlParser()).getElementsByTag(rssParser.items)
    } catch {
        await sendSourceError("RSS scraping error, couldn't fetch items: \(error)")

        return tempResults
    }

    for item in items {
        // Runs one complex query against the current item; a failing query simply yields nil
        func lookup(_ query: String, attribute: String, discriminator: String?, regex: String?) -> String? {
            try? runRssComplexQuery(
                item: item,
                query: query,
                attribute: attribute,
                discriminator: discriminator,
                regexString: regex
            )
        }

        // Parse magnet link or translate hash
        var magnetHash: String?
        if let hashParser = rssParser.magnetHash {
            let scrapedHash = lookup(
                hashParser.query,
                attribute: hashParser.attribute,
                discriminator: hashParser.discriminator,
                regex: hashParser.regex
            )

            magnetHash = fetchMagnetHash(existingHash: scrapedHash)
        }

        var title: String?
        if let titleParser = rssParser.title {
            title = lookup(
                titleParser.query,
                attribute: titleParser.attribute,
                discriminator: titleParser.discriminator,
                regex: titleParser.regex
            )
        }

        // Prefer a scraped link, then one generated from the hash; otherwise skip the entry
        let link: String?
        if let linkParser = rssParser.magnetLink {
            link = lookup(
                linkParser.query,
                attribute: linkParser.attribute,
                discriminator: linkParser.discriminator,
                regex: linkParser.regex
            )
        } else if let magnetHash {
            link = generateMagnetLink(magnetHash: magnetHash, title: title, trackers: source.trackers)
        } else {
            continue
        }

        guard let href = link, href.starts(with: "magnet:") else {
            continue
        }

        if magnetHash == nil {
            magnetHash = fetchMagnetHash(magnetLink: href)
        }

        var size: String?
        if let sizeParser = rssParser.size {
            size = lookup(
                sizeParser.query,
                attribute: sizeParser.attribute,
                discriminator: sizeParser.discriminator,
                regex: sizeParser.regex
            )
        }

        // A purely numeric size is taken as a byte count and formatted for display
        if let rawSize = size, let byteCount = Int64(rawSize) {
            size = byteCountFormatter.string(fromByteCount: byteCount)
        }

        var seeders: String?
        var leechers: String?
        if let seedLeechParser = rssParser.seedLeech {
            if let seederQuery = seedLeechParser.seeders {
                seeders = lookup(
                    seederQuery,
                    attribute: seedLeechParser.attribute,
                    discriminator: seedLeechParser.discriminator,
                    regex: seedLeechParser.seederRegex
                )
            }

            if let leecherQuery = seedLeechParser.leechers {
                leechers = lookup(
                    leecherQuery,
                    attribute: seedLeechParser.attribute,
                    discriminator: seedLeechParser.discriminator,
                    regex: seedLeechParser.leecherRegex
                )
            }
        }

        let parsedResult = SearchResult(
            title: title ?? "No title",
            source: source.name,
            size: size ?? "",
            magnetLink: href,
            magnetHash: magnetHash,
            seeders: seeders,
            leechers: leechers
        )

        if !tempResults.contains(parsedResult) {
            tempResults.append(parsedResult)
        }
    }

    return tempResults
}
/// Extracts a single value from an RSS item and optionally narrows it with a regex.
///
/// - Parameters:
///   - item: The RSS item element to search in.
///   - query: A tag name when `attribute` is "text". Otherwise the attribute value matched
///     against `discriminator`, and unused when there is no discriminator.
///   - attribute: "text" to read the tag's text, otherwise the name of the attribute to read.
///   - discriminator: Optional attribute name whose value must equal `query` to pick the element.
///   - regexString: Optional regex applied to the extracted value.
/// - Returns: The regex's first capture group when the regex matches, otherwise the raw extracted
///   value (`nil` if nothing was found).
/// - Throws: Any error raised by the element lookups.
func runRssComplexQuery(item: Element, query: String, attribute: String, discriminator: String?, regexString: String?) throws -> String? {
    let parsedValue: String?

    if attribute == "text" {
        parsedValue = try item.getElementsByTag(query).first()?.text()
    } else if let discriminator {
        // There's a key/value to look the element up with, so query it
        parsedValue = try item.getElementsByAttributeValue(discriminator, query).first()?.attr(attribute)
    } else {
        // Otherwise assume the value lives on the first element carrying the attribute
        parsedValue = try item.getElementsByAttribute(attribute).first()?.attr(attribute)
    }

    // A capture group must be used in the provided regex
    guard
        let regexString,
        let rawValue = parsedValue,
        let regexValue = try? Regex(regexString).firstMatch(in: rawValue)?.groups[safe: 0]?.value
    else {
        return parsedValue
    }

    return regexValue
}
/// Scrapes an HTML search page into search results.
///
/// Every element matching `htmlParser.rows` is treated as one result row. A row is kept only if it
/// yields a link starting with "magnet:" and an equal result is not already in the list. A row
/// that throws while being scraped is reported through `sendSourceError` and skipped.
///
/// - Parameters:
///   - source: The source being scraped. Supplies the HTML parser and the source name.
///   - baseUrl: Prefix for relative links to external magnet pages.
///   - html: The page contents.
/// - Returns: The parsed results. Empty when the source has no HTML parser, no magnet link parser,
///   or the page can't be parsed.
public func scrapeHtml(source: Source, baseUrl: String, html: String) async -> [SearchResult] {
    var tempResults: [SearchResult] = []

    guard let htmlParser = source.htmlParser else {
        return tempResults
    }

    // Without a magnet parser no row can produce a result, so there is nothing to scrape
    guard let magnetParser = htmlParser.magnetLink else {
        return tempResults
    }

    var rows = Elements()

    do {
        let document = try SwiftSoup.parse(html)
        rows = try document.select(htmlParser.rows)
    } catch {
        await sendSourceError("Scraping error, couldn't fetch rows: \(error)")

        return tempResults
    }

    // If there's an error, continue instead of returning with nothing
    for row in rows {
        do {
            // Fetches the magnet link
            // If the magnet is located on an external page, fetch the external page and grab the magnet link
            // External page fetching affects source performance
            var href: String
            if let externalMagnetQuery = magnetParser.externalLinkQuery, !externalMagnetQuery.isEmpty {
                guard
                    let externalMagnetLink = try row.select(externalMagnetQuery).first()?.attr("href"),
                    let data = await fetchWebsiteData(urlString: baseUrl + externalMagnetLink),
                    let magnetHtml = String(data: data, encoding: .utf8)
                else {
                    continue
                }

                let magnetDocument = try SwiftSoup.parse(magnetHtml)
                guard let linkResult = try magnetDocument.select(magnetParser.query).first() else {
                    continue
                }

                if magnetParser.attribute == "text" {
                    href = try linkResult.text()
                } else {
                    href = try linkResult.attr(magnetParser.attribute)
                }
            } else {
                guard let link = try runHtmlComplexQuery(
                    row: row,
                    query: magnetParser.query,
                    attribute: magnetParser.attribute,
                    regexString: magnetParser.regex
                ) else {
                    continue
                }

                href = link
            }

            if !href.starts(with: "magnet:") {
                continue
            }

            // Fetches the magnet hash
            let magnetHash = fetchMagnetHash(magnetLink: href)

            // Fetches the episode/movie title
            var title: String?
            if let titleParser = htmlParser.title {
                title = try? runHtmlComplexQuery(
                    row: row,
                    query: titleParser.query,
                    attribute: titleParser.attribute,
                    regexString: titleParser.regex
                )
            }

            // Fetches the torrent's size
            // TODO: Add int translation
            var size: String?
            if let sizeParser = htmlParser.size {
                size = try? runHtmlComplexQuery(
                    row: row,
                    query: sizeParser.query,
                    attribute: sizeParser.attribute,
                    regexString: sizeParser.regex
                )
            }

            // Fetches seeders and leechers if there are any
            var seeders: String?
            var leechers: String?
            if let seederLeecher = htmlParser.seedLeech {
                if let combinedQuery = seederLeecher.combined {
                    // Both counts live in one string; pull each out with its own regex
                    if let combinedString = try? runHtmlComplexQuery(
                        row: row,
                        query: combinedQuery,
                        attribute: seederLeecher.attribute,
                        regexString: nil
                    ) {
                        if let seederRegex = seederLeecher.seederRegex, let leecherRegex = seederLeecher.leecherRegex {
                            // Seeder regex matching
                            seeders = try? Regex(seederRegex).firstMatch(in: combinedString)?.groups[safe: 0]?.value

                            // Leecher regex matching
                            leechers = try? Regex(leecherRegex).firstMatch(in: combinedString)?.groups[safe: 0]?.value
                        }
                    }
                } else {
                    if let seederQuery = seederLeecher.seeders {
                        seeders = try? runHtmlComplexQuery(
                            row: row,
                            query: seederQuery,
                            attribute: seederLeecher.attribute,
                            regexString: seederLeecher.seederRegex
                        )
                    }

                    // Use the leecher query here; reusing the seeder query would scrape the wrong element
                    if let leecherQuery = seederLeecher.leechers {
                        leechers = try? runHtmlComplexQuery(
                            row: row,
                            query: leecherQuery,
                            attribute: seederLeecher.attribute,
                            regexString: seederLeecher.leecherRegex
                        )
                    }
                }
            }

            let result = SearchResult(
                title: title ?? "No title",
                source: source.name,
                size: size ?? "",
                magnetLink: href,
                magnetHash: magnetHash,
                seeders: seeders,
                leechers: leechers
            )

            if !tempResults.contains(result) {
                tempResults.append(result)
            }
        } catch {
            await sendSourceError("Scraping error: \(error)")

            continue
        }
    }

    return tempResults
}
/// Extracts a single value from an HTML row and optionally narrows it with a regex.
///
/// - Parameters:
///   - row: The row element to search in.
///   - query: Selector for the element holding the value; the first match is used.
///   - attribute: "text" to read the element's text, otherwise the name of the attribute to read.
///   - regexString: Optional regex applied to the extracted value.
/// - Returns: The regex's first capture group when the regex matches, otherwise the raw extracted
///   value (`nil` if nothing was found).
/// - Throws: Any error raised by the element lookups.
func runHtmlComplexQuery(row: Element, query: String, attribute: String, regexString: String?) throws -> String? {
    let matchedElement = try row.select(query).first()

    let parsedValue: String?
    if attribute == "text" {
        parsedValue = try matchedElement?.text()
    } else {
        parsedValue = try matchedElement?.attr(attribute)
    }

    // A capture group must be used in the provided regex
    if let regexString,
       let rawValue = parsedValue,
       let regexValue = try? Regex(regexString).firstMatch(in: rawValue)?.groups[safe: 0]?.value
    {
        return regexValue
    }

    return parsedValue
}
/// Returns a normalized magnet hash taken from an explicit hash or from a magnet link.
///
/// `existingHash` wins when present. Otherwise the hash is cut out of `magnetLink` as the fourth
/// ":"-separated component, up to the first "&". A 32-character hash is Base32-decoded and
/// returned as a hex string; any other hash is returned lowercased.
///
/// - Parameters:
///   - magnetLink: Magnet link to extract the hash from when no explicit hash is given.
///   - existingHash: A hash that was already scraped.
/// - Returns: The normalized hash, or `nil` when no hash is available or Base32 decoding fails.
public func fetchMagnetHash(magnetLink: String? = nil, existingHash: String? = nil) -> String? {
    let candidateHash: String

    if let existingHash {
        candidateHash = existingHash
    } else if
        let magnetLink,
        let hashSegment = magnetLink.split(separator: ":")[safe: 3],
        let hashOnly = hashSegment.split(separator: "&")[safe: 0]
    {
        candidateHash = String(hashOnly)
    } else {
        return nil
    }

    // Anything that isn't 32 characters long is passed through lowercased
    guard candidateHash.count == 32 else {
        return candidateHash.lowercased()
    }

    // A 32-character hash is taken to be Base32 encoded and converted to hex
    return base32DecodeToData(candidateHash)?.hexEncodedString()
}
/// Converts a byte count given as an integer string into a decimal KB/MB/GB string.
///
/// The unit is chosen from the number's magnitude, so signs and leading zeros in the input do not
/// affect which unit is picked.
///
/// - Parameter sizeString: The byte count as a base-10 integer string.
/// - Returns: The size with its unit (for example "1.5 KB"), or `nil` when the string is not an
///   integer or the magnitude is below 1000 bytes.
func parseSizeString(sizeString: String) -> String? {
    // Test if the string can be a full integer
    guard let size = Int(sizeString) else {
        return nil
    }

    // `magnitude` is unsigned, so it is safe even for Int.min
    let magnitude = size.magnitude
    let bytes = Double(size)

    if magnitude >= 1_000_000_000 {
        // This is a GB
        return "\(bytes / 1e9) GB"
    } else if magnitude >= 1_000_000 {
        // This is a MB
        return "\(bytes / 1e6) MB"
    } else if magnitude >= 1_000 {
        // This is a KB
        return "\(bytes / 1e3) KB"
    } else {
        return nil
    }
}
/// Builds a magnet link from a hash, an optional display name and optional trackers.
///
/// Parameter values are percent-encoded with the URL-path character set minus `&`, `=` and `+`,
/// so a title or tracker containing those delimiters cannot split or inject magnet parameters.
///
/// - Parameters:
///   - magnetHash: The info hash, placed after `xt=urn:btih:` as-is.
///   - title: Optional display name, added as the `dn` parameter.
///   - trackers: Optional tracker URLs, each added as a `tr` parameter. Entries that
///     `URL(string:)` rejects are skipped.
/// - Returns: The assembled magnet link.
public func generateMagnetLink(magnetHash: String, title: String?, trackers: [String]?) -> String {
    // Characters that delimit query parameters must not survive inside a value
    var valueAllowedCharacters = CharacterSet.urlPathAllowed
    valueAllowedCharacters.remove(charactersIn: "&=+")

    var magnetLink = "magnet:?xt=urn:btih:" + magnetHash

    if let title, let encodedTitle = title.addingPercentEncoding(withAllowedCharacters: valueAllowedCharacters) {
        magnetLink += "&dn=\(encodedTitle)"
    }

    for trackerUrl in trackers ?? [] {
        if URL(string: trackerUrl) != nil,
           let encodedUrlString = trackerUrl.addingPercentEncoding(withAllowedCharacters: valueAllowedCharacters)
        {
            magnetLink += "&tr=\(encodedUrlString)"
        }
    }

    return magnetLink
}
/// Clears stored API credential values after a failed API request and tells the user what happened.
///
/// The value of each credential whose `dynamic` flag is `false` is set to `nil`. The reported
/// message names the credentials that were reset. If credentials exist but none were reset, the
/// user is asked to check them in the source's settings instead. The change is saved afterwards.
///
/// - Parameter api: The source API whose credentials should be cleaned.
func cleanApiCreds(api: SourceApi) async {
    let backgroundContext = PersistenceController.shared.backgroundContext

    let hasCredentials = api.clientId != nil || api.clientSecret != nil

    // Names of the credentials that were actually reset, in display order
    var resetCredentialNames: [String] = []

    if let clientId = api.clientId, !clientId.dynamic {
        clientId.value = nil
        resetCredentialNames.append("client ID")
    }

    if let clientSecret = api.clientSecret, !clientSecret.dynamic {
        clientSecret.value = nil
        resetCredentialNames.append("token")
    }

    var response = "Could not fetch API results"

    if !resetCredentialNames.isEmpty {
        response += " your \(resetCredentialNames.joined(separator: " and ")) was automatically reset."
    } else if hasCredentials {
        // Nothing was reset, so don't claim otherwise; point the user at the settings instead
        response += ". Make sure all credentials are correct in the source's settings!"
    }

    await sendSourceError(response)

    PersistenceController.shared.save(backgroundContext)
}
}