Sources: Improve regex and require title

Titles are now required, as an entry without a title shouldn't be
featured. Regex support is now added for matching, along with
splicing strings via capture groups.

If the regex has no capture group, assume that a contains check
is occurring: a matching value is returned unchanged.

Also migrate back to searchText being located in scrapingModel.

Signed-off-by: kingbri <bdashore3@proton.me>
This commit is contained in:
kingbri 2023-03-31 22:47:13 -04:00
parent 2982c971a8
commit 2cf6e46422
5 changed files with 118 additions and 94 deletions

View file

@ -63,10 +63,10 @@ public struct SourceJsonParserJson: Codable, Hashable, Sendable {
let searchUrl: String
let results: String?
let subResults: String?
let title: SourceComplexQueryJson
let magnetHash: SourceComplexQueryJson?
let magnetLink: SourceComplexQueryJson?
let subName: SourceComplexQueryJson?
let title: SourceComplexQueryJson?
let size: SourceComplexQueryJson?
let sl: SourceSLJson?
}
@ -75,10 +75,10 @@ public struct SourceRssParserJson: Codable, Hashable, Sendable {
let rssUrl: String?
let searchUrl: String
let items: String
let title: SourceComplexQueryJson
let magnetHash: SourceComplexQueryJson?
let magnetLink: SourceComplexQueryJson?
let subName: SourceComplexQueryJson?
let title: SourceComplexQueryJson?
let size: SourceComplexQueryJson?
let sl: SourceSLJson?
}
@ -86,9 +86,9 @@ public struct SourceRssParserJson: Codable, Hashable, Sendable {
public struct SourceHtmlParserJson: Codable, Hashable, Sendable {
let searchUrl: String
let rows: String
let title: SourceComplexQueryJson
let magnet: SourceMagnetJson
let subName: SourceComplexQueryJson?
let title: SourceComplexQueryJson?
let size: SourceComplexQueryJson?
let sl: SourceSLJson?
}

View file

@ -584,14 +584,12 @@ public class PluginManager: ObservableObject {
newSourceJsonParser.subName = newSourceSubName
}
if let titleJson = jsonParserJson.title {
let newSourceTitle = SourceTitle(context: backgroundContext)
newSourceTitle.query = titleJson.query
newSourceTitle.attribute = titleJson.attribute ?? "text"
newSourceTitle.discriminator = titleJson.discriminator
let newSourceTitle = SourceTitle(context: backgroundContext)
newSourceTitle.query = jsonParserJson.title.query
newSourceTitle.attribute = jsonParserJson.title.attribute ?? "text"
newSourceTitle.discriminator = jsonParserJson.title.discriminator
newSourceJsonParser.title = newSourceTitle
}
newSourceJsonParser.title = newSourceTitle
if let sizeJson = jsonParserJson.size {
let newSourceSize = SourceSize(context: backgroundContext)
@ -656,15 +654,13 @@ public class PluginManager: ObservableObject {
newSourceRssParser.subName = newSourceSubName
}
if let titleJson = rssParserJson.title {
let newSourceTitle = SourceTitle(context: backgroundContext)
newSourceTitle.query = titleJson.query
newSourceTitle.attribute = titleJson.attribute ?? "text"
newSourceTitle.discriminator = titleJson.discriminator
newSourceTitle.regex = titleJson.regex
let newSourceTitle = SourceTitle(context: backgroundContext)
newSourceTitle.query = rssParserJson.title.query
newSourceTitle.attribute = rssParserJson.title.attribute ?? "text"
newSourceTitle.discriminator = rssParserJson.title.discriminator
newSourceTitle.regex = rssParserJson.title.regex
newSourceRssParser.title = newSourceTitle
}
newSourceRssParser.title = newSourceTitle
if let sizeJson = rssParserJson.size {
let newSourceSize = SourceSize(context: backgroundContext)
@ -708,15 +704,13 @@ public class PluginManager: ObservableObject {
newSourceHtmlParser.subName = newSourceSubName
}
// Adds a title complex query if present
if let titleJson = htmlParserJson.title {
let newSourceTitle = SourceTitle(context: backgroundContext)
newSourceTitle.query = titleJson.query
newSourceTitle.attribute = titleJson.attribute ?? "text"
newSourceTitle.regex = titleJson.regex
// Adds a title complex query
let newSourceTitle = SourceTitle(context: backgroundContext)
newSourceTitle.query = htmlParserJson.title.query
newSourceTitle.attribute = htmlParserJson.title.attribute ?? "text"
newSourceTitle.regex = htmlParserJson.title.regex
newSourceHtmlParser.title = newSourceTitle
}
newSourceHtmlParser.title = newSourceTitle
// Adds a size complex query if present
if let sizeJson = htmlParserJson.size {

View file

@ -22,6 +22,7 @@ class ScrapingViewModel: ObservableObject {
runningSearchTask = nil
}
@Published var searchText: String = ""
@Published var searchResults: [SearchResult] = []
// Only add results with valid magnet hashes to the search results array
@ -67,7 +68,7 @@ class ScrapingViewModel: ObservableObject {
await logManager?.error(description, showToast: false)
}
public func scanSources(sources: [Source], searchText: String, debridManager: DebridManager) async {
public func scanSources(sources: [Source], debridManager: DebridManager) async {
await logManager?.info("Started scanning sources for query \"\(searchText)\"")
if sources.isEmpty {
@ -101,7 +102,7 @@ class ScrapingViewModel: ObservableObject {
if source.enabled {
group.addTask {
await self.updateCurrentSourceNames(source.name)
let requestResult = await self.executeParser(source: source, searchText: searchText)
let requestResult = await self.executeParser(source: source)
return (requestResult, source.name)
}
@ -142,7 +143,7 @@ class ScrapingViewModel: ObservableObject {
}
}
func executeParser(source: Source, searchText: String) async -> SearchRequestResult? {
func executeParser(source: Source) async -> SearchRequestResult? {
guard let baseUrl = source.baseUrl else {
await logManager?.error("Scraping: The base URL could not be found for source \(source.name)")
@ -364,7 +365,7 @@ class ScrapingViewModel: ObservableObject {
// Fetches the data for a URL
public func fetchWebsiteData(urlString: String, sourceName: String) async -> Data? {
guard let url = URL(string: urlString) else {
guard let url = URL(string: urlString.trimmingCharacters(in: .whitespacesAndNewlines)) else {
await sendSourceError("\(sourceName): Source doesn't contain a valid URL, contact the source dev!")
return nil
@ -467,11 +468,36 @@ class ScrapingViewModel: ObservableObject {
return SearchRequestResult(results: tempResults, magnets: magnets)
}
// TODO: Add regex parsing for API
public func parseJsonResult(_ result: JSON,
jsonParser: SourceJsonParser,
source: Source,
existingSearchResult: SearchResult? = nil) -> SearchResult?
{
// Enforce these parsers
guard let titleParser = jsonParser.title else {
return nil
}
var title: String? = existingSearchResult?.title
if let existingTitle = title,
let discriminatorQuery = titleParser.discriminator
{
let rawDiscriminator = result[discriminatorQuery.components(separatedBy: ".")].rawValue
if !(rawDiscriminator is NSNull) {
title = String(describing: rawDiscriminator) + existingTitle
}
} else if title == nil {
let rawTitle = result[titleParser.query].rawValue
title = rawTitle is NSNull ? nil : String(describing: rawTitle)
}
// Return if a title doesn't exist
if title == nil {
return nil
}
var magnetHash: String? = existingSearchResult?.magnet.hash
if let magnetHashParser = jsonParser.magnetHash {
let rawHash = result[magnetHashParser.query.components(separatedBy: ".")].rawValue
@ -487,23 +513,7 @@ class ScrapingViewModel: ObservableObject {
link = rawLink is NSNull ? nil : String(describing: rawLink)
}
var title: String? = existingSearchResult?.title
if let titleParser = jsonParser.title {
if let existingTitle = existingSearchResult?.title,
let discriminatorQuery = titleParser.discriminator
{
let rawDiscriminator = result[discriminatorQuery.components(separatedBy: ".")].rawValue
if !(rawDiscriminator is NSNull) {
title = String(describing: rawDiscriminator) + existingTitle
}
} else if existingSearchResult?.title == nil {
let rawTitle = result[titleParser.query].rawValue
title = rawTitle is NSNull ? nil : String(describing: rawTitle)
}
}
// Return if no magnet hash exists
// Return if a magnet hash doesn't exist
let magnet = Magnet(hash: magnetHash, link: link, title: title, trackers: source.trackers)
if magnet.hash == nil {
return nil
@ -573,6 +583,21 @@ class ScrapingViewModel: ObservableObject {
var magnets: [Magnet] = []
for item in items {
// Enforce these parsers
guard let titleParser = rssParser.title else {
continue
}
guard let title = try? runRssComplexQuery(
item: item,
query: titleParser.query,
attribute: titleParser.attribute,
discriminator: titleParser.discriminator,
regexString: titleParser.regex
) else {
continue
}
// Parse magnet link or translate hash
var magnetHash: String?
if let magnetHashParser = rssParser.magnetHash {
@ -596,17 +621,6 @@ class ScrapingViewModel: ObservableObject {
)
}
var title: String?
if let titleParser = rssParser.title {
title = try? runRssComplexQuery(
item: item,
query: titleParser.query,
attribute: titleParser.attribute,
discriminator: titleParser.discriminator,
regexString: titleParser.regex
)
}
// Fetches the subName for the source if there is one
var subName: String?
if let subNameParser = rssParser.subName {
@ -666,7 +680,7 @@ class ScrapingViewModel: ObservableObject {
}
let result = SearchResult(
title: title ?? "No title",
title: title,
source: subName.map { "\(source.name) - \($0)" } ?? source.name,
size: size ?? "",
magnet: magnet,
@ -708,10 +722,8 @@ class ScrapingViewModel: ObservableObject {
// A capture group must be used in the provided regex
if let regexString,
let parsedValue,
let regexValue = try? Regex(regexString).firstMatch(in: parsedValue)?.groups[safe: 0]?.value
{
return regexValue
let parsedValue {
return runRegex(parsedValue: parsedValue, regexString: regexString)
} else {
return parsedValue
}
@ -740,18 +752,37 @@ class ScrapingViewModel: ObservableObject {
// If there's an error, continue instead of returning with nothing
for row in rows {
do {
// Fetches the magnet link
// If the magnet is located on an external page, fetch the external page and grab the magnet link
// External page fetching affects source performance
guard let magnetParser = htmlParser.magnetLink else {
// Enforce these parsers
guard
let magnetParser = htmlParser.magnetLink,
let titleParser = htmlParser.title
else {
continue
}
// Fetches the episode/movie title
// Place here for filtering purposes
guard let title = try? runHtmlComplexQuery(
row: row,
query: titleParser.query,
attribute: titleParser.attribute,
regexString: titleParser.regex
) else {
continue
}
// Fetches the magnet link
// If the magnet is located on an external page, fetch the external page and grab the magnet link
// External page fetching affects source performance
var href: String
if let externalMagnetQuery = magnetParser.externalLinkQuery, !externalMagnetQuery.isEmpty {
guard let externalMagnetUrl = try row.select(externalMagnetQuery).first()?.attr("href") else {
continue
}
let replacedMagnetUrl = externalMagnetUrl.starts(with: "/") ? baseUrl + externalMagnetUrl : externalMagnetUrl
guard
let externalMagnetLink = try row.select(externalMagnetQuery).first()?.attr("href"),
let data = await fetchWebsiteData(urlString: baseUrl + externalMagnetLink, sourceName: source.name),
let data = await fetchWebsiteData(urlString: replacedMagnetUrl, sourceName: source.name),
let magnetHtml = String(data: data, encoding: .utf8)
else {
continue
@ -786,17 +817,6 @@ class ScrapingViewModel: ObservableObject {
continue
}
// Fetches the episode/movie title
var title: String?
if let titleParser = htmlParser.title {
title = try? runHtmlComplexQuery(
row: row,
query: titleParser.query,
attribute: titleParser.attribute,
regexString: titleParser.regex
)
}
var subName: String?
if let subNameParser = htmlParser.subName {
subName = try? runHtmlComplexQuery(
@ -859,7 +879,7 @@ class ScrapingViewModel: ObservableObject {
}
let result = SearchResult(
title: title ?? "No title",
title: title,
source: subName.map { "\(source.name) - \($0)" } ?? source.name,
size: size ?? "",
magnet: magnet,
@ -898,12 +918,25 @@ class ScrapingViewModel: ObservableObject {
parsedValue = try result?.attr(attribute)
}
// A capture group must be used in the provided regex
if let regexString,
let parsedValue,
let regexValue = try? Regex(regexString).firstMatch(in: parsedValue)?.groups[safe: 0]?.value
{
return regexValue
if let parsedValue,
let regexString {
return runRegex(parsedValue: parsedValue, regexString: regexString)
} else {
return parsedValue
}
}
func runRegex(parsedValue: String, regexString: String) -> String? {
let replacedRegexString = regexString
.replacingOccurrences(of: "{query}", with: searchText)
guard let matchedRegex = try? Regex(replacedRegexString).firstMatch(in: parsedValue) else {
return nil
}
// Is there a capture group present? Otherwise return the original matched string
if let group = matchedRegex.groups[safe: 0] {
return group.value
} else {
return parsedValue
}

View file

@ -16,8 +16,6 @@ struct SearchResultsView: View {
@AppStorage("Behavior.UsesRandomSearchText") var usesRandomSearchText: Bool = false
@Binding var searchText: String
@Binding var searchPrompt: String
@State private var lastSearchPromptIndex: Int = -1
let searchBarTextArray: [String] = [
@ -38,7 +36,7 @@ struct SearchResultsView: View {
.onAppear {
searchPrompt = getSearchPrompt()
}
.onChange(of: searchText) { newText in
.onChange(of: scrapingModel.searchText) { newText in
if newText.isEmpty, isSearching {
searchPrompt = getSearchPrompt()
}

View file

@ -16,18 +16,18 @@ struct ContentView: View {
@AppStorage("Behavior.AutocorrectSearch") var autocorrectSearch: Bool = false
@State private var searchText: String = ""
// TODO: Fix searchPrompt updating
@State private var searchPrompt: String = "Search"
var body: some View {
NavView {
List {
SearchResultsView(searchText: $searchText, searchPrompt: $searchPrompt)
SearchResultsView(searchPrompt: $searchPrompt)
}
.listStyle(.insetGrouped)
.inlinedList(inset: 20)
.navigationTitle("Search")
.searchable(text: $searchText, placement: .navigationBarDrawer(displayMode: .always), prompt: Text(searchPrompt))
.searchable(text: $scrapingModel.searchText, placement: .navigationBarDrawer(displayMode: .always), prompt: Text(searchPrompt))
.onSubmit(of: .search) {
if let runningSearchTask = scrapingModel.runningSearchTask, runningSearchTask.isCancelled {
scrapingModel.runningSearchTask = nil
@ -38,7 +38,6 @@ struct ContentView: View {
let sources = pluginManager.fetchInstalledSources()
await scrapingModel.scanSources(
sources: sources,
searchText: searchText,
debridManager: debridManager
)