mirror of
https://github.com/cranci1/Sora.git
synced 2026-04-21 08:32:00 +00:00
Add Additional HTML based parsing for Modules (#223)
* Add Additional HTML based parsing for Modules * Update JavaScriptCore+Extensions.swift --------- Co-authored-by: cranci <100066266+cranci1@users.noreply.github.com>
This commit is contained in:
parent
e88f58b4b2
commit
e449c047d2
1 changed files with 52 additions and 0 deletions
|
|
@ -333,11 +333,63 @@ extension JSContext {
|
||||||
self.setObject(atobFunction, forKeyedSubscript: "atob" as NSString)
|
self.setObject(atobFunction, forKeyedSubscript: "atob" as NSString)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Installs a set of regex-based HTML/string helper functions into this
/// JavaScript context by evaluating a script that declares them globally.
///
/// Helpers defined: `getElementsByTag`, `getAttribute`, `getInnerText`,
/// `extractBetween`, `stripHtml`, `normalizeWhitespace`, `urlEncode`,
/// `urlDecode`, `htmlEntityDecode`, `transformResponse`.
///
/// The script is written as a Swift *raw* string (`#"""`), so every backslash
/// below reaches JavaScript unchanged. Inside JS template literals the regex
/// escapes are doubled (`\\s`) because a template literal would otherwise
/// collapse `\s` to a plain `s` before `RegExp` ever sees it.
///
/// The value returned by `evaluateScript` is ignored.
func setupScrapingUtilities() {
    let scrapingUtils = #"""
    // Returns the inner HTML of every <tag ...>...</tag> occurrence in `html`.
    // `tag` is inserted into the pattern verbatim (it is not regex-escaped).
    function getElementsByTag(html, tag) {
        const regex = new RegExp(`<${tag}[^>]*>([\\s\\S]*?)<\\/${tag}>`, 'gi');
        const result = [];
        let match;
        while ((match = regex.exec(html)) !== null) {
            result.push(match[1]);
        }
        return result;
    }

    // Returns the value of `attr` on the first matching <tag ...>, or null.
    // Handles double-quoted, single-quoted and unquoted values; quoted values
    // may contain spaces. The attribute name must be preceded by whitespace so
    // that asking for `src` does not pick up `data-src`.
    function getAttribute(html, tag, attr) {
        const regex = new RegExp(
            `<${tag}[^>]*?\\s${attr}\\s*=\\s*(?:"([^"]*)"|'([^']*)'|([^\\s"'>]+))[^>]*>`,
            'i'
        );
        const match = regex.exec(html);
        if (!match) return null;
        if (match[1] !== undefined) return match[1];
        if (match[2] !== undefined) return match[2];
        return match[3];
    }

    // Removes tags, collapses whitespace runs to a single space, and trims.
    function getInnerText(html) {
        return html.replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim();
    }

    // Returns the text between the first `start` and the next `end` after it,
    // or '' when either delimiter is missing.
    function extractBetween(str, start, end) {
        const s = str.indexOf(start);
        if (s === -1) return '';
        const e = str.indexOf(end, s + start.length);
        if (e === -1) return '';
        return str.substring(s + start.length, e);
    }

    // Removes anything that looks like a tag; whitespace is left untouched.
    function stripHtml(html) {
        return html.replace(/<[^>]+>/g, '');
    }

    function normalizeWhitespace(str) {
        return str.replace(/\s+/g, ' ').trim();
    }

    function urlEncode(str) {
        return encodeURIComponent(str);
    }

    // Falls back to the input unchanged when it is not valid percent-encoding.
    function urlDecode(str) {
        try { return decodeURIComponent(str); } catch (e) { return str; }
    }

    // Decodes the five basic named entities; anything else is left as written.
    function htmlEntityDecode(str) {
        const entities = { quot: '"', apos: "'", amp: '&', lt: '<', gt: '>' };
        return str.replace(/&([a-zA-Z]+);/g, function(whole, name) {
            // Own-property check: a bare entities[name] would resolve inherited
            // keys such as "constructor" and substitute a function's source.
            return Object.prototype.hasOwnProperty.call(entities, name) ? entities[name] : whole;
        });
    }

    // Applies `fn` to `response`; returns `response` unchanged if `fn` throws.
    function transformResponse(response, fn) {
        try { return fn(response); } catch (e) { return response; }
    }
    """#
    self.evaluateScript(scrapingUtils)
}
|
||||||
|
|
||||||
/// Prepares this context for module scripts by running each setup routine
/// once, in the fixed order listed below.
func setupJavaScriptEnvironment() {
    self.setupWeirdCode()
    self.setupConsoleLogging()

    // Fetch bridges.
    self.setupNativeFetch()
    self.setupFetchV2()

    // Pure-JS / string helpers.
    self.setupBase64Functions()
    self.setupScrapingUtilities()
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue