# Building Scrapers
This guide covers the technical details of implementing scrapers, from basic structure to advanced patterns.
## The Combo Scraper Pattern
The most common and recommended pattern is the "combo scraper" that handles both movies and TV shows with a single function. This reduces code duplication and ensures consistent behavior.
### Basic Structure
```typescript
import { SourcererEmbed, SourcererOutput, makeSourcerer } from '@/providers/base';
import { MovieScrapeContext, ShowScrapeContext } from '@/utils/context';
import { NotFoundError } from '@/utils/errors';

// Main scraping function that handles both movies and TV shows
async function comboScraper(ctx: ShowScrapeContext | MovieScrapeContext): Promise<SourcererOutput> {
  // 1. Build the appropriate URL based on media type
  const embedUrl = `https://embed.su/embed/${
    ctx.media.type === 'movie'
      ? `movie/${ctx.media.tmdbId}`
      : `tv/${ctx.media.tmdbId}/${ctx.media.season.number}/${ctx.media.episode.number}`
  }`;

  // 2. Fetch the embed page using the proxied fetcher
  const embedPage = await ctx.proxiedFetcher<string>(embedUrl, {
    headers: {
      Referer: 'https://embed.su/',
      'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
    },
  });

  // 3. Extract and decode the page configuration
  const vConfigMatch = embedPage.match(/window\.vConfig\s*=\s*JSON\.parse\(atob\(`([^`]+)/i);
  const encodedConfig = vConfigMatch?.[1];
  if (!encodedConfig) throw new NotFoundError('No encoded config found');

  // 4. Process the data (stringAtob is a base64-decode helper defined alongside the scraper)
  const decodedConfig = JSON.parse(await stringAtob(encodedConfig));
  if (!decodedConfig?.hash) throw new NotFoundError('No stream hash found');

  // 5. Report progress to the loading indicator
  ctx.progress(50);

  // Decode the hash into the server list (site-specific; decodeServers is a placeholder for that logic)
  const secondDecode: { hash: string }[] = decodeServers(decodedConfig.hash);

  // 6. Build the final result
  const embeds: SourcererEmbed[] = secondDecode.map((server) => ({
    embedId: 'viper', // ID of the embed scraper to handle this URL
    url: `https://embed.su/api/e/${server.hash}`,
  }));
  ctx.progress(90);
  return { embeds };
}

// Export the scraper configuration
export const embedsuScraper = makeSourcerer({
  id: 'embedsu', // Unique identifier
  name: 'embed.su', // Display name
  rank: 165, // Priority rank (must be unique)
  disabled: false, // Whether the scraper is disabled
  flags: [], // Feature flags (see Advanced Concepts)
  scrapeMovie: comboScraper, // Function for movies
  scrapeShow: comboScraper, // Function for TV shows
});
```
### Alternative: Separate Functions
Use separate functions when the movie and TV show logic differ significantly. In most cases, though, the combo scraper is still the better choice.
```typescript
async function scrapeMovie(ctx: MovieScrapeContext): Promise<SourcererOutput> {
  // Movie-specific logic
  const movieUrl = `${baseUrl}/movie/${ctx.media.tmdbId}`;
  // ... movie processing
}

async function scrapeShow(ctx: ShowScrapeContext): Promise<SourcererOutput> {
  // TV show-specific logic
  const showUrl = `${baseUrl}/tv/${ctx.media.tmdbId}/${ctx.media.season.number}/${ctx.media.episode.number}`;
  // ... show processing
}

export const myScraper = makeSourcerer({
  id: 'my-scraper',
  name: 'My Scraper',
  rank: 150,
  disabled: false,
  flags: [],
  scrapeMovie: scrapeMovie, // Separate functions
  scrapeShow: scrapeShow,
});
```
## Return Types
A `SourcererOutput` can contain two kinds of results. Understanding when to use each is crucial:
### 1. Embeds Array (Most Common)
Use when your scraper finds embed players that need further processing:
```typescript
return {
  embeds: [
    {
      embedId: 'turbovid', // Must match an existing embed scraper ID
      url: 'https://turbovid.com/embed/abc123',
    },
    {
      embedId: 'mixdrop', // Backup option
      url: 'https://mixdrop.co/embed/def456',
    },
  ],
};
```
**When to use:**
- Your scraper finds embed player URLs
- You want to leverage existing embed scrapers
- The site uses common players (turbovid, mixdrop, etc.)
- You want to provide multiple server options
### 2. Stream Array (Direct Streams)
Use when your scraper finds direct video streams that are ready to play:
```typescript
import { flags } from '@/entrypoint/utils/targets';

// For HLS streams
return {
  embeds: [], // Can be empty when returning streams
  stream: [
    {
      id: 'primary',
      type: 'hls',
      playlist: streamUrl,
      flags: [flags.CORS_ALLOWED],
      captions: [], // Subtitle tracks (optional)
    },
  ],
};

// For MP4 files with a single quality
return {
  embeds: [],
  stream: [
    {
      id: 'primary',
      captions, // Caption tracks gathered earlier in the scraper
      qualities: {
        unknown: {
          type: 'mp4',
          url: streamUrl,
        },
      },
      type: 'file',
      flags: [flags.CORS_ALLOWED],
    },
  ],
};

// For MP4 files with multiple qualities, build the quality map with a reduce similar to this.
// Here `data.streams` is a site-specific object mapping quality labels (e.g. '1080P', '4K', 'ORG') to URLs.
const streams = Object.entries(data.streams).reduce((acc: Record<string, string>, [quality, url]) => {
  let qualityKey: number;
  if (quality === 'ORG') {
    // Only add unknown quality if it's an mp4 (handle URLs with query parameters)
    const urlPath = url.split('?')[0]; // Remove query parameters
    if (urlPath.toLowerCase().endsWith('.mp4')) {
      acc.unknown = url;
    }
    return acc;
  }
  if (quality === '4K') {
    qualityKey = 2160;
  } else {
    qualityKey = parseInt(quality.replace('P', ''), 10);
  }
  if (Number.isNaN(qualityKey) || acc[qualityKey]) return acc;
  acc[qualityKey] = url;
  return acc;
}, {});

// Filter qualities based on provider type
const filteredStreams = Object.entries(streams).reduce((acc: Record<string, string>, [quality, url]) => {
  // Skip unknown quality for cached providers
  if (provider.useCacheUrl && quality === 'unknown') {
    return acc;
  }
  acc[quality] = url;
  return acc;
}, {});

// Then return each quality like so
return {
  embeds: [],
  stream: [
    {
      id: 'primary',
      captions: [],
      qualities: {
        ...(filteredStreams[2160] && {
          '4k': {
            type: 'mp4',
            url: filteredStreams[2160],
          },
        }),
        ...(filteredStreams[1080] && {
          1080: {
            type: 'mp4',
            url: filteredStreams[1080],
          },
        }),
        ...(filteredStreams[720] && {
          720: {
            type: 'mp4',
            url: filteredStreams[720],
          },
        }),
        ...(filteredStreams[480] && {
          480: {
            type: 'mp4',
            url: filteredStreams[480],
          },
        }),
        ...(filteredStreams[360] && {
          360: {
            type: 'mp4',
            url: filteredStreams[360],
          },
        }),
        ...(filteredStreams.unknown && {
          unknown: {
            type: 'mp4',
            url: filteredStreams.unknown,
          },
        }),
      },
      type: 'file',
      flags: [flags.CORS_ALLOWED],
    },
  ],
};
```
**When to use:**
- Your scraper can extract direct video URLs
- The site provides its own player technology
- You need fine control over stream handling
- The streams don't require complex embed processing
## Context and Utilities
The scraper context (`ctx`) provides everything you need for implementation:
### Media Information
```typescript
// Basic media info (always available)
ctx.media.title; // "Spirited Away"
ctx.media.type; // "movie" | "show"
ctx.media.tmdbId; // 129
ctx.media.releaseYear; // 2001
ctx.media.imdbId; // "tt0245429" (when available)
// For TV shows only (check ctx.media.type === 'show')
ctx.media.season.number; // 1
ctx.media.season.tmdbId; // Season TMDB ID
ctx.media.episode.number; // 5
ctx.media.episode.tmdbId; // Episode TMDB ID
```
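Many sites are searched by title and release year rather than TMDB ID. A minimal sketch of such a lookup built from the fields above; the `/search` endpoint, query parameters, and response shape are assumptions for illustration, not a real site's API:
```typescript
// Hypothetical search endpoint; real sites will differ
const searchResults = await ctx.proxiedFetcher<{ results: { id: string; year: number }[] }>('/search', {
  baseUrl: 'https://example-site.com',
  query: { keyword: ctx.media.title, year: String(ctx.media.releaseYear) },
});
// Match on release year to avoid picking a remake or a similarly named title
const match = searchResults.results.find((result) => result.year === ctx.media.releaseYear);
if (!match) throw new NotFoundError('No matching title found');
```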
### HTTP Client
```typescript
// Always use proxiedFetcher for external requests to avoid CORS issues
const response = await ctx.proxiedFetcher<string>('https://example.com/api', {
  method: 'POST',
  headers: {
    'User-Agent': 'Mozilla/5.0...',
    Referer: 'https://example.com',
  },
  body: JSON.stringify({ key: 'value' }),
});

// For API calls with a base URL
const data = await ctx.proxiedFetcher('/search', {
  baseUrl: 'https://api.example.com',
  query: { q: ctx.media.title, year: ctx.media.releaseYear },
});
```
### Progress Updates
```typescript
// Update the loading indicator (0-100)
ctx.progress(25); // Found media page
// ... processing ...
ctx.progress(50); // Extracted embed links
// ... more processing ...
ctx.progress(90); // Almost done
```
## Common Patterns
### 1. URL Building
```typescript
// Handle different media types
const buildUrl = (ctx: ShowScrapeContext | MovieScrapeContext) => {
  const apiUrl =
    ctx.media.type === 'movie'
      ? `${baseUrl}/movie/${ctx.media.tmdbId}`
      : `${baseUrl}/tv/${ctx.media.tmdbId}/${ctx.media.season.number}/${ctx.media.episode.number}`;
  return apiUrl;
};
```
### 2. Data Extraction
```typescript
import { load } from 'cheerio';

// Scraping with Cheerio
const $ = load(embedPage);
const embedUrls = $('iframe[src*="turbovid"]')
  .map((_, el) => $(el).attr('src'))
  .get()
  .filter(Boolean);

// Regex extraction
const configMatch = embedPage.match(/window\.playerConfig\s*=\s*({.*?});/s);
if (configMatch) {
  const config = JSON.parse(configMatch[1]);
  // Process config...
}
```
### 3. Error Handling
```typescript
import { NotFoundError } from '@/utils/errors';

// Throw NotFoundError when the content is not found
if (!embedUrls.length) {
  throw new NotFoundError('No embed players found');
}

// Throw a generic Error for other issues
if (!apiResponse.success) {
  throw new Error(`API request failed: ${apiResponse.message}`);
}
```
### 4. Protected Streams
There are several ways to bypass protections on streams.
Using the M3U8 proxy:
```typescript
import { createM3U8ProxyUrl } from '@/utils/proxy';

// For streams that require special headers
const streamHeaders = {
  Referer: 'https://player.example.com/',
  Origin: 'https://player.example.com',
  'User-Agent': 'Mozilla/5.0...',
};

return {
  stream: [
    {
      id: 'primary',
      type: 'hls',
      playlist: createM3U8ProxyUrl(originalPlaylist, ctx.features, streamHeaders),
      headers: streamHeaders, // Pass the headers both to createM3U8ProxyUrl and here, for native and extension targets
      flags: [flags.CORS_ALLOWED], // The M3U8 proxy (or the extension) bypasses CORS, so the stream is allowed to play in a browser
      captions: [],
    },
  ],
};
```
Using the browser extension:
```typescript
// For streams that require special headers
const streamHeaders = {
  Referer: 'https://player.example.com/',
  Origin: 'https://player.example.com',
  'User-Agent': 'Mozilla/5.0...',
};

return {
  stream: [
    {
      id: 'primary',
      type: 'hls',
      playlist: originalPlaylist,
      headers: streamHeaders,
      flags: [], // No flags: the extension (or a native target) can pass the headers itself
      captions: [],
    },
  ],
};
```
## Building Embed Scrapers
Embed scrapers follow a simpler pattern since they only handle one URL type:
```typescript
import { makeEmbed } from '@/providers/base';
import { flags } from '@/entrypoint/utils/targets';
import { NotFoundError } from '@/utils/errors';

export const myEmbedScraper = makeEmbed({
  id: 'my-embed',
  name: 'My Embed Player',
  rank: 120,
  async scrape(ctx) {
    // ctx.url contains the embed URL from a source

    // 1. Fetch the embed page
    const embedPage = await ctx.proxiedFetcher<string>(ctx.url);

    // 2. Extract the stream URL (example with regex)
    const streamMatch = embedPage.match(/src:\s*["']([^"']+\.m3u8[^"']*)/);
    if (!streamMatch) {
      throw new NotFoundError('No stream found in embed');
    }

    // 3. Return the stream
    return {
      stream: [
        {
          id: 'primary',
          type: 'hls',
          playlist: streamMatch[1],
          flags: [flags.CORS_ALLOWED],
          captions: [],
        },
      ],
    };
  },
});
```
## Testing Your Scrapers
### 1. Basic Testing
```sh
# Test your scraper with CLI
pnpm cli --source-id my-scraper --tmdb-id 11527
# Test different content types
pnpm cli --source-id my-scraper --tmdb-id 94605 --season 1 --episode 1 # TV show
```
### 2. Real CLI Output Examples
**Testing a source that returns embeds:**
```sh
pnpm cli --source-id catflix --tmdb-id 11527
```
```json
{
  "embeds": [
    {
      "embedId": "turbovid",
      "url": "https://turbovid.eu/embed/DjncbDBEmbLW"
    }
  ]
}
```
**Testing an embed that returns streams:**
```sh
pnpm cli --source-id turbovid --url "https://turbovid.eu/embed/DjncbDBEmbLW"
```
```js
{
  stream: [
    {
      type: 'hls',
      id: 'primary',
      playlist: 'https://proxy.fifthwit.net/m3u8-proxy?url=https%3A%2F%2Fqueenselti.pro%2Fwrofm%2Fuwu.m3u8&headers=%7B%22referer%22%3A%22https%3A%2F%2Fturbovid.eu%2F%22%2C%22origin%22%3A%2F%2Fturbovid.eu%22%7D',
      flags: [flags.CORS_ALLOWED],
      captions: []
    }
  ]
}
```
**Notice**: The playlist URL shows how `createM3U8ProxyUrl()` creates proxied URLs to handle protected streams.
### 3. Comprehensive Testing
Test with a variety of content; example commands follow this list:
- Popular movies (The Shining: 11527, Spirited Away: 129, Avatar: 19995)
- Recent releases (check current popular movies)
- TV shows with multiple seasons
- Anime series (different episode numbering)
- Different languages/regions
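For instance, the TMDB IDs above can be run through the CLI directly (the season and episode numbers below are just placeholders):
```sh
# Movies listed above
pnpm cli --source-id my-scraper --tmdb-id 11527   # The Shining
pnpm cli --source-id my-scraper --tmdb-id 129     # Spirited Away
pnpm cli --source-id my-scraper --tmdb-id 19995   # Avatar

# A multi-season TV show (season/episode values are placeholders)
pnpm cli --source-id my-scraper --tmdb-id 94605 --season 2 --episode 3
```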
### 4. Debug Mode
```typescript
// Add temporary debug logging to your scraper
console.log('Fetching URL:', embedUrl);
console.log('Response status:', response.status);
console.log('Extracted data:', extractedData);
```
## Next Steps
Once you've built your scraper:
1. Test thoroughly with multiple content types
2. Check [Advanced Concepts](/in-depth/advanced-concepts) for flags and error handling
3. Register in `all.ts` with a unique rank (see the sketch below)
4. Submit a pull request with testing documentation
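A minimal sketch of that registration, assuming `all.ts` collects sources into an exported array; the import paths and surrounding entries here are illustrative, not the file's exact contents:
```typescript
// all.ts (illustrative excerpt)
import { embedsuScraper } from './sources/embedsu';
import { myScraper } from './sources/my-scraper';

export const allSources = [
  embedsuScraper,
  myScraper, // rank 150: make sure no other source uses this rank
  // ...other sources
];
```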
::alert{type="warning"}
Always test your scrapers with both movies and TV shows, and include multiple examples in your pull request description.
::