Merge pull request #9 from FifthWit/master

letterboxd scraping
This commit is contained in:
FifthWit 2025-06-03 13:28:14 -05:00 committed by GitHub
commit e8ea9e0c54
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 418 additions and 3 deletions

273
package-lock.json generated
View file

@ -1,11 +1,14 @@
{
"name": "open-backend",
"name": "backend",
"version": "2.0.1",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"version": "2.0.1",
"dependencies": {
"@prisma/client": "^6.4.1",
"cheerio": "^1.0.0",
"dotenv": "^16.4.7",
"jsonwebtoken": "^9.0.2",
"prom-client": "^15.1.3",
@ -2588,6 +2591,12 @@
"node": ">=18"
}
},
"node_modules/boolbase": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
"integrity": "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==",
"license": "ISC"
},
"node_modules/brace-expansion": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz",
@ -2834,6 +2843,48 @@
"node": ">=8"
}
},
"node_modules/cheerio": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/cheerio/-/cheerio-1.0.0.tgz",
"integrity": "sha512-quS9HgjQpdaXOvsZz82Oz7uxtXiy6UIsIQcpBj7HRw2M63Skasm9qlDocAM7jNuaxdhpPU7c4kJN+gA5MCu4ww==",
"license": "MIT",
"dependencies": {
"cheerio-select": "^2.1.0",
"dom-serializer": "^2.0.0",
"domhandler": "^5.0.3",
"domutils": "^3.1.0",
"encoding-sniffer": "^0.2.0",
"htmlparser2": "^9.1.0",
"parse5": "^7.1.2",
"parse5-htmlparser2-tree-adapter": "^7.0.0",
"parse5-parser-stream": "^7.1.2",
"undici": "^6.19.5",
"whatwg-mimetype": "^4.0.0"
},
"engines": {
"node": ">=18.17"
},
"funding": {
"url": "https://github.com/cheeriojs/cheerio?sponsor=1"
}
},
"node_modules/cheerio-select": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/cheerio-select/-/cheerio-select-2.1.0.tgz",
"integrity": "sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==",
"license": "BSD-2-Clause",
"dependencies": {
"boolbase": "^1.0.0",
"css-select": "^5.1.0",
"css-what": "^6.1.0",
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.0.1"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/chokidar": {
"version": "4.0.3",
"resolved": "https://registry.npmjs.org/chokidar/-/chokidar-4.0.3.tgz",
@ -3228,6 +3279,34 @@
"uncrypto": "^0.1.3"
}
},
"node_modules/css-select": {
"version": "5.1.0",
"resolved": "https://registry.npmjs.org/css-select/-/css-select-5.1.0.tgz",
"integrity": "sha512-nwoRF1rvRRnnCqqY7updORDsuqKzqYJ28+oSMaJMMgOauh3fvwHqMS7EZpIPqK8GL+g9mKxF1vP/ZjSeNjEVHg==",
"license": "BSD-2-Clause",
"dependencies": {
"boolbase": "^1.0.0",
"css-what": "^6.1.0",
"domhandler": "^5.0.2",
"domutils": "^3.0.1",
"nth-check": "^2.0.1"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/css-what": {
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/css-what/-/css-what-6.1.0.tgz",
"integrity": "sha512-HTUrgRJ7r4dsZKU6GjmpfRK1O76h97Z8MfS1G0FozR+oF2kG6Vfe8JE6zwrkbxigziPHinCJ+gCPjA9EaBDtRw==",
"license": "BSD-2-Clause",
"engines": {
"node": ">= 6"
},
"funding": {
"url": "https://github.com/sponsors/fb55"
}
},
"node_modules/db0": {
"version": "0.3.1",
"resolved": "https://registry.npmjs.org/db0/-/db0-0.3.1.tgz",
@ -3399,6 +3478,61 @@
"node": ">=8"
}
},
"node_modules/dom-serializer": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz",
"integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==",
"license": "MIT",
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.2",
"entities": "^4.2.0"
},
"funding": {
"url": "https://github.com/cheeriojs/dom-serializer?sponsor=1"
}
},
"node_modules/domelementtype": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz",
"integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==",
"funding": [
{
"type": "github",
"url": "https://github.com/sponsors/fb55"
}
],
"license": "BSD-2-Clause"
},
"node_modules/domhandler": {
"version": "5.0.3",
"resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz",
"integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==",
"license": "BSD-2-Clause",
"dependencies": {
"domelementtype": "^2.3.0"
},
"engines": {
"node": ">= 4"
},
"funding": {
"url": "https://github.com/fb55/domhandler?sponsor=1"
}
},
"node_modules/domutils": {
"version": "3.2.2",
"resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz",
"integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==",
"license": "BSD-2-Clause",
"dependencies": {
"dom-serializer": "^2.0.0",
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3"
},
"funding": {
"url": "https://github.com/fb55/domutils?sponsor=1"
}
},
"node_modules/dot-prop": {
"version": "9.0.0",
"resolved": "https://registry.npmjs.org/dot-prop/-/dot-prop-9.0.0.tgz",
@ -3489,6 +3623,19 @@
"node": ">= 0.8"
}
},
"node_modules/encoding-sniffer": {
"version": "0.2.0",
"resolved": "https://registry.npmjs.org/encoding-sniffer/-/encoding-sniffer-0.2.0.tgz",
"integrity": "sha512-ju7Wq1kg04I3HtiYIOrUrdfdDvkyO9s5XM8QAj/bN61Yo/Vb4vgJxy5vi4Yxk01gWHbrofpPtpxM8bKger9jhg==",
"license": "MIT",
"dependencies": {
"iconv-lite": "^0.6.3",
"whatwg-encoding": "^3.1.1"
},
"funding": {
"url": "https://github.com/fb55/encoding-sniffer?sponsor=1"
}
},
"node_modules/end-of-stream": {
"version": "1.4.4",
"resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz",
@ -3498,6 +3645,18 @@
"once": "^1.4.0"
}
},
"node_modules/entities": {
"version": "4.5.0",
"resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz",
"integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==",
"license": "BSD-2-Clause",
"engines": {
"node": ">=0.12"
},
"funding": {
"url": "https://github.com/fb55/entities?sponsor=1"
}
},
"node_modules/error-stack-parser-es": {
"version": "1.0.5",
"resolved": "https://registry.npmjs.org/error-stack-parser-es/-/error-stack-parser-es-1.0.5.tgz",
@ -4653,6 +4812,25 @@
"dev": true,
"license": "MIT"
},
"node_modules/htmlparser2": {
"version": "9.1.0",
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz",
"integrity": "sha512-5zfg6mHUoaer/97TxnGpxmbR7zJtPwIYFMZ/H5ucTlPZhKvtum05yiPK3Mgai3a0DyVxv7qYqoweaEd2nrYQzQ==",
"funding": [
"https://github.com/fb55/htmlparser2?sponsor=1",
{
"type": "github",
"url": "https://github.com/sponsors/fb55"
}
],
"license": "MIT",
"dependencies": {
"domelementtype": "^2.3.0",
"domhandler": "^5.0.3",
"domutils": "^3.1.0",
"entities": "^4.5.0"
}
},
"node_modules/http-cache-semantics": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.1.1.tgz",
@ -4735,7 +4913,6 @@
"version": "0.6.3",
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
"dev": true,
"license": "MIT",
"dependencies": {
"safer-buffer": ">= 2.1.2 < 3.0.0"
@ -5922,6 +6099,18 @@
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/nth-check": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/nth-check/-/nth-check-2.1.1.tgz",
"integrity": "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==",
"license": "BSD-2-Clause",
"dependencies": {
"boolbase": "^1.0.0"
},
"funding": {
"url": "https://github.com/fb55/nth-check?sponsor=1"
}
},
"node_modules/nypm": {
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/nypm/-/nypm-0.6.0.tgz",
@ -6148,6 +6337,55 @@
"node": ">=6"
}
},
"node_modules/parse5": {
"version": "7.3.0",
"resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz",
"integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==",
"license": "MIT",
"dependencies": {
"entities": "^6.0.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/parse5-htmlparser2-tree-adapter": {
"version": "7.1.0",
"resolved": "https://registry.npmjs.org/parse5-htmlparser2-tree-adapter/-/parse5-htmlparser2-tree-adapter-7.1.0.tgz",
"integrity": "sha512-ruw5xyKs6lrpo9x9rCZqZZnIUntICjQAd0Wsmp396Ul9lN/h+ifgVV1x1gZHi8euej6wTfpqX8j+BFQxF0NS/g==",
"license": "MIT",
"dependencies": {
"domhandler": "^5.0.3",
"parse5": "^7.0.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/parse5-parser-stream": {
"version": "7.1.2",
"resolved": "https://registry.npmjs.org/parse5-parser-stream/-/parse5-parser-stream-7.1.2.tgz",
"integrity": "sha512-JyeQc9iwFLn5TbvvqACIF/VXG6abODeB3Fwmv/TGdLk2LfbWkaySGY72at4+Ty7EkPZj854u4CrICqNk2qIbow==",
"license": "MIT",
"dependencies": {
"parse5": "^7.0.0"
},
"funding": {
"url": "https://github.com/inikulin/parse5?sponsor=1"
}
},
"node_modules/parse5/node_modules/entities": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/entities/-/entities-6.0.0.tgz",
"integrity": "sha512-aKstq2TDOndCn4diEyp9Uq/Flu2i1GlLkc6XIDQSDMuaFE3OPW5OphLCyQ5SpSJZTb4reN+kTcYru5yIfXoRPw==",
"license": "BSD-2-Clause",
"engines": {
"node": ">=0.12"
},
"funding": {
"url": "https://github.com/fb55/entities?sponsor=1"
}
},
"node_modules/parseurl": {
"version": "1.3.3",
"resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz",
@ -6844,7 +7082,6 @@
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
"dev": true,
"license": "MIT"
},
"node_modules/scule": {
@ -7665,6 +7902,15 @@
"@types/estree": "^1.0.0"
}
},
"node_modules/undici": {
"version": "6.21.3",
"resolved": "https://registry.npmjs.org/undici/-/undici-6.21.3.tgz",
"integrity": "sha512-gBLkYIlEnSp8pFbT64yFgGE6UIB9tAkhukC23PmMDCe5Nd+cRqKxSjw5y54MK2AZMgZfJWMaNE4nYUHgi1XEOw==",
"license": "MIT",
"engines": {
"node": ">=18.17"
}
},
"node_modules/undici-types": {
"version": "6.20.0",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.20.0.tgz",
@ -8019,6 +8265,27 @@
"dev": true,
"license": "MIT"
},
"node_modules/whatwg-encoding": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
"integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==",
"license": "MIT",
"dependencies": {
"iconv-lite": "0.6.3"
},
"engines": {
"node": ">=18"
}
},
"node_modules/whatwg-mimetype": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz",
"integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==",
"license": "MIT",
"engines": {
"node": ">=18"
}
},
"node_modules/whatwg-url": {
"version": "14.2.0",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.2.0.tgz",

View file

@ -19,6 +19,7 @@
},
"dependencies": {
"@prisma/client": "^6.4.1",
"cheerio": "^1.0.0",
"dotenv": "^16.4.7",
"jsonwebtoken": "^9.0.2",
"prom-client": "^15.1.3",

View file

@ -0,0 +1,147 @@
import * as cheerio from 'cheerio';
import { TMDB } from 'tmdb-ts';
const tmdb = new TMDB(useRuntimeConfig().tmdbApiKey);
export default defineCachedEventHandler(async (event) => {
try {
const response = await fetch('https://letterboxd.com/lists/');
let html = await response.text();
html = html.replace(/\\n/g, '\n').replace(/\\t/g, '\t');
const $ = cheerio.load(html);
const listItems = $('a.list-link').map((i, el) => ({
href: $(el).attr('href'),
title: $(el).find('.list-name').text().trim() || $(el).attr('title'),
text: $(el).text().trim()
})).get();
if (!listItems.length) {
return {
lists: [],
error: 'No lists found'
};
}
const allLists = [];
for (let i = 0; i < listItems.length; i++) {
const listItem = listItems[i];
if (!listItem.href) continue;
try {
const listUrl = `https://letterboxd.com${listItem.href}`;
const listResponse = await fetch(listUrl);
let listHtml = await listResponse.text();
listHtml = listHtml.replace(/\\n/g, '\n').replace(/\\t/g, '\t');
const list$ = cheerio.load(listHtml);
const ogTitle = list$('meta[property="og:title"]').attr('content');
const listName = ogTitle || listItem.title;
const listStatsText = list$('.list-meta .stats').text();
const itemCountMatch = listStatsText.match(/(\d+)\s*film/i);
const expectedItemCount = itemCountMatch ? parseInt(itemCountMatch[1]) : null;
const possibleFilmSelectors = [
'li.poster-container',
'.poster-container',
'.film-poster',
'.poster',
'li[data-film-slug]',
'[data-film-slug]',
'.listitem',
'.list-item'
];
let films = [];
let workingSelector = '';
for (const selector of possibleFilmSelectors) {
const elements = list$(selector);
if (elements.length > 0) {
workingSelector = selector;
films = elements.map((i, el) => {
const filmSlug = list$(el).attr('data-film-slug');
const targetLink = list$(el).attr('data-target-link');
const filmId = list$(el).attr('data-film-id');
const filmName = filmSlug
? filmSlug.split('-').map(word =>
word.charAt(0).toUpperCase() + word.slice(1)
).join(' ')
: targetLink?.split('/film/')[1]?.split('/')[0]?.replace(/-/g, ' ');
return {
name: filmName,
slug: filmSlug,
link: targetLink,
filmId: filmId
};
}).get().filter(film => film.name);
if (films.length > 0) break;
}
}
const tmdbIds = [];
for (const film of films) {
try {
const searchResult = await tmdb.search.movies({ query: film.name });
if (searchResult.results && searchResult.results.length > 0) {
const tmdbId = searchResult.results[0].id;
tmdbIds.push(tmdbId);
}
} catch (error) {
continue;
}
}
allLists.push({
listName: listName,
listUrl: listUrl,
tmdbIds,
metadata: {
originalFilmCount: films.length,
foundTmdbIds: tmdbIds.length,
expectedItemCount: expectedItemCount,
workingSelector
}
});
} catch (error) {
allLists.push({
listName: listItem.title,
listUrl: `https://letterboxd.com${listItem.href}`,
tmdbIds: [],
metadata: {
originalFilmCount: 0,
foundTmdbIds: 0,
expectedItemCount: null,
error: 'Failed to process list'
}
});
}
}
return {
lists: allLists,
totalLists: allLists.length,
summary: {
totalTmdbIds: allLists.reduce((sum, list) => sum + list.tmdbIds.length, 0),
totalExpectedItems: allLists.reduce((sum, list) => sum + (list.metadata.expectedItemCount || 0), 0)
}
};
} catch (error) {
throw createError({
statusCode: 500,
statusMessage: 'Failed to fetch Letterboxd lists'
});
}
});