Ferrite-backup/Ferrite/ViewModels/ScrapingViewModel.swift
kingbri b7c9c75e3b Ferrite: Overhaul sources
Sources are now completely changed to use a more flexible API. This
uses a fully native source system, so there will be 0 overhead on
resource usage and performance.

JSON objects specify what is fetched and displayed by Ferrite when
searching torrents.

Sources now include sizes, seeders, and leechers for any site that
specifies them.

The versioning and repo naming framework has been added, but will be
displayed in another update.

API support will be included in another update.

Signed-off-by: kingbri <bdashore3@gmail.com>
2022-08-04 21:33:59 -04:00

287 lines
10 KiB
Swift

//
// ScrapingViewModel.swift
// Ferrite
//
// Created by Brian Dashore on 7/4/22.
//
import Base32
import Regex
import SwiftSoup
import SwiftUI
public struct SearchResult: Hashable, Codable {
let title: String
let source: String
let size: String
let magnetLink: String
let magnetHash: String?
let seeders: String?
let leechers: String?
}
class ScrapingViewModel: ObservableObject {
@AppStorage("RealDebrid.Enabled") var realDebridEnabled = false
// Link the toast view model for single-directional communication
var toastModel: ToastViewModel?
@Published var searchResults: [SearchResult] = []
@Published var searchText: String = ""
@Published var selectedSearchResult: SearchResult?
@Published var filteredSource: Source?
@MainActor
public func scanSources(sources: [Source]) async {
if sources.isEmpty {
print("Sources empty")
return
}
var tempResults: [SearchResult] = []
for source in sources {
if source.enabled {
if let htmlParser = source.htmlParser {
guard let encodedQuery = searchText.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else {
toastModel?.toastDescription = "Could not process search query, invalid characters present."
print("Could not process search query, invalid characters present")
continue
}
let urlString = source.baseUrl + htmlParser.searchUrl.replacingOccurrences(of: "{query}", with: encodedQuery)
guard let html = await fetchWebsiteHtml(urlString: urlString) else {
continue
}
let sourceResults = await scrapeWebsite(source: source, html: html)
tempResults += sourceResults
}
}
}
searchResults = tempResults
}
// Fetches the HTML for a URL
@MainActor
public func fetchWebsiteHtml(urlString: String) async -> String? {
guard let url = URL(string: urlString) else {
toastModel?.toastDescription = "Source doesn't contain a valid URL, contact the source dev!"
print("Source doesn't contain a valid URL, contact the source dev!")
return nil
}
do {
let (data, _) = try await URLSession.shared.data(from: url)
let html = String(data: data, encoding: .ascii)
return html
} catch {
toastModel?.toastDescription = "Error in fetching HTML \(error)"
print("Error in fetching HTML \(error)")
return nil
}
}
// Returns results to UI
// Results must have a link and title, but other parameters aren't required
@MainActor
public func scrapeWebsite(source: Source, html: String) async -> [SearchResult] {
guard let htmlParser = source.htmlParser else {
return []
}
var rows = Elements()
do {
let document = try SwiftSoup.parse(html)
rows = try document.select(htmlParser.rows)
} catch {
toastModel?.toastDescription = "Scraping error, couldn't fetch rows: \(error)"
print("Scraping error, couldn't fetch rows: \(error)")
return []
}
var tempResults: [SearchResult] = []
// If there's an error, continue instead of returning with nothing
for row in rows {
do {
// Fetches the magnet link
// If the magnet is located on an external page, fetch the external page and grab the magnet link
// External page fetching affects source performance
guard let magnetParser = htmlParser.magnet else {
continue
}
var href: String
if let externalMagnetQuery = magnetParser.externalLinkQuery, !externalMagnetQuery.isEmpty {
guard let externalMagnetLink = try row.select(externalMagnetQuery).first()?.attr("href") else {
continue
}
guard let magnetHtml = await fetchWebsiteHtml(urlString: source.baseUrl + externalMagnetLink) else {
continue
}
let magnetDocument = try SwiftSoup.parse(magnetHtml)
guard let linkResult = try magnetDocument.select(magnetParser.query).first() else {
continue
}
if magnetParser.attribute == "text" {
href = try linkResult.text()
} else {
href = try linkResult.attr(magnetParser.attribute)
}
} else {
guard let link = try runComplexQuery(
row: row,
query: magnetParser.query,
attribute: magnetParser.attribute,
regexString: magnetParser.regex
) else {
continue
}
href = link
}
if !href.starts(with: "magnet:") {
continue
}
// Fetches the magnet hash
let magnetHash = fetchMagnetHash(magnetLink: href)
// Fetches the episode/movie title
var title: String?
if let titleParser = htmlParser.title {
title = try? runComplexQuery(
row: row,
query: titleParser.query,
attribute: titleParser.attribute,
regexString: titleParser.regex
)
}
// Fetches the torrent's size
var size: String?
if let sizeParser = htmlParser.size {
size = try? runComplexQuery(
row: row,
query: sizeParser.query,
attribute: sizeParser.attribute,
regexString: sizeParser.regex
)
}
// Fetches seeders and leechers if there are any
var seeders: String?
var leechers: String?
if let seederLeecher = htmlParser.seedLeech {
if let combinedQuery = seederLeecher.combined {
if let combinedString = try? runComplexQuery(
row: row,
query: combinedQuery,
attribute: seederLeecher.attribute,
regexString: nil
) {
if let seederRegex = seederLeecher.seederRegex, let leecherRegex = seederLeecher.leecherRegex {
// Seeder regex matching
seeders = try? Regex(seederRegex).firstMatch(in: combinedString)?.groups[safe: 0]?.value
// Leecher regex matching
leechers = try? Regex(leecherRegex).firstMatch(in: combinedString)?.groups[safe: 0]?.value
}
}
} else {
if let seederQuery = seederLeecher.seeders {
seeders = try? runComplexQuery(
row: row,
query: seederQuery,
attribute: seederLeecher.attribute,
regexString: seederLeecher.seederRegex
)
}
if let leecherQuery = seederLeecher.seeders {
leechers = try? runComplexQuery(
row: row,
query: leecherQuery,
attribute: seederLeecher.attribute,
regexString: seederLeecher.leecherRegex
)
}
}
}
let result = SearchResult(
title: title ?? "No title",
source: source.name,
size: size ?? "",
magnetLink: href,
magnetHash: magnetHash,
seeders: seeders,
leechers: leechers
)
tempResults.append(result)
} catch {
toastModel?.toastDescription = "Scraping error: \(error)"
print("Scraping error: \(error)")
continue
}
}
return tempResults
}
func runComplexQuery(row: Element, query: String, attribute: String, regexString: String?) throws -> String? {
var parsedValue: String?
let result = try row.select(query).first()
switch attribute {
case "text":
parsedValue = try result?.text()
default:
parsedValue = try result?.attr(attribute)
}
// A capture group must be used in the provided regex
if let regexString = regexString,
let parsedValue = parsedValue,
let regexValue = try Regex(regexString).firstMatch(in: parsedValue)?.groups[safe: 0]?.value
{
return regexValue
} else {
return parsedValue
}
}
// Fetches and possibly converts the magnet hash value to sha1
public func fetchMagnetHash(magnetLink: String) -> String? {
guard let firstSplit = magnetLink.split(separator: ":")[safe: 3] else {
return nil
}
guard let magnetHash = firstSplit.split(separator: "&")[safe: 0] else {
return nil
}
// Is this a Base32hex hash?
if magnetHash.count == 32 {
let decryptedMagnetHash = base32DecodeToData(String(magnetHash))
return decryptedMagnetHash?.hexEncodedString()
} else {
return String(magnetHash).lowercased()
}
}
}