mirror of
https://github.com/p-stream/providers.git
synced 2026-01-11 20:10:33 +00:00
512 lines
14 KiB
Markdown
512 lines
14 KiB
Markdown
# Building Scrapers
|
|
|
|
This guide covers the technical details of implementing scrapers, from basic structure to advanced patterns.
|
|
|
|
## The Combo Scraper Pattern
|
|
|
|
The most common and recommended pattern is the "combo scraper" that handles both movies and TV shows with a single function. This reduces code duplication and ensures consistent behavior.
|
|
|
|
### Basic Structure
|
|
|
|
```typescript
|
|
import { SourcererEmbed, SourcererOutput, makeSourcerer } from '@/providers/base';
|
|
import { MovieScrapeContext, ShowScrapeContext } from '@/utils/context';
|
|
import { NotFoundError } from '@/utils/errors';
|
|
|
|
// Main scraping function that handles both movies and TV shows
|
|
async function comboScraper(ctx: ShowScrapeContext | MovieScrapeContext): Promise<SourcererOutput> {
|
|
// 1. Build the appropriate URL based on media type
|
|
const embedUrl = `https://embed.su/embed/${
|
|
ctx.media.type === 'movie'
|
|
? `movie/${ctx.media.tmdbId}`
|
|
: `tv/${ctx.media.tmdbId}/${ctx.media.season.number}/${ctx.media.episode.number}`
|
|
}`;
|
|
|
|
// 2. Fetch the embed page using proxied fetcher
|
|
const embedPage = await ctx.proxiedFetcher<string>(embedUrl, {
|
|
headers: {
|
|
Referer: 'https://embed.su/',
|
|
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
|
|
},
|
|
});
|
|
|
|
// 3. Extract and decode configuration
|
|
const vConfigMatch = embedPage.match(/window\.vConfig\s*=\s*JSON\.parse\(atob\(`([^`]+)/i);
|
|
const encodedConfig = vConfigMatch?.[1];
|
|
if (!encodedConfig) throw new NotFoundError('No encoded config found');
|
|
|
|
// 4. Process the data (decode, decrypt, etc.)
|
|
const decodedConfig = JSON.parse(await stringAtob(encodedConfig));
|
|
if (!decodedConfig?.hash) throw new NotFoundError('No stream hash found');
|
|
|
|
// 5. Update progress to show we're making progress
|
|
ctx.progress(50);
|
|
|
|
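// 6. Build the final result (secondDecode is a placeholder for the decoded server list; the decoding steps are omitted from this example)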
|
|
const embeds: SourcererEmbed[] = secondDecode.map((server) => ({
|
|
embedId: 'viper', // ID of the embed scraper to handle this URL
|
|
url: `https://embed.su/api/e/${server.hash}`,
|
|
}));
|
|
|
|
ctx.progress(90);
|
|
|
|
return { embeds };
|
|
}
|
|
|
|
// Export the scraper configuration
|
|
export const embedsuScraper = makeSourcerer({
|
|
id: 'embedsu', // Unique identifier
|
|
name: 'embed.su', // Display name
|
|
rank: 165, // Priority rank (must be unique)
|
|
disabled: false, // Whether the scraper is disabled
|
|
flags: [], // Feature flags (see Advanced Concepts)
|
|
scrapeMovie: comboScraper, // Function for movies
|
|
scrapeShow: comboScraper, // Function for TV shows
|
|
});
|
|
```
|
|
|
|
### Alternative: Separate Functions
|
|
|
|
For complex cases where movie and TV show logic differs significantly, you can provide separate functions instead. However, it's best to use the combo scraper whenever possible!
|
|
|
|
```typescript
|
|
async function scrapeMovie(ctx: MovieScrapeContext): Promise<SourcererOutput> {
|
|
// Movie-specific logic
|
|
const movieUrl = `${baseUrl}/movie/${ctx.media.tmdbId}`;
|
|
// ... movie processing
|
|
}
|
|
|
|
async function scrapeShow(ctx: ShowScrapeContext): Promise<SourcererOutput> {
|
|
// TV show-specific logic
|
|
const showUrl = `${baseUrl}/tv/${ctx.media.tmdbId}/${ctx.media.season.number}/${ctx.media.episode.number}`;
|
|
// ... show processing
|
|
}
|
|
|
|
export const myScraper = makeSourcerer({
|
|
id: 'my-scraper',
|
|
name: 'My Scraper',
|
|
rank: 150,
|
|
disabled: false,
|
|
flags: [],
|
|
scrapeMovie: scrapeMovie, // Separate functions
|
|
scrapeShow: scrapeShow,
|
|
});
|
|
```
|
|
|
|
## Return Types
|
|
|
|
A `SourcererOutput` can contain two types of data. Understanding when to use each is crucial:
|
|
|
|
### 1. Embeds Array (Most Common)
|
|
|
|
Use when your scraper finds embed players that need further processing:
|
|
|
|
```typescript
|
|
return {
|
|
embeds: [
|
|
{
|
|
embedId: 'turbovid', // Must match an existing embed scraper ID
|
|
url: 'https://turbovid.com/embed/abc123'
|
|
},
|
|
{
|
|
embedId: 'mixdrop', // Backup option
|
|
url: 'https://mixdrop.co/embed/def456'
|
|
}
|
|
]
|
|
};
|
|
```
|
|
|
|
**When to use:**
|
|
- Your scraper finds embed player URLs
|
|
- You want to leverage existing embed scrapers
|
|
- The site uses common players (turbovid, mixdrop, etc.)
|
|
- You want to provide multiple server options
|
|
|
|
### 2. Stream Array (Direct Streams)
|
|
|
|
Use when your scraper finds direct video streams that are ready to play:
|
|
|
|
```typescript
|
|
import { flags } from '@/entrypoint/utils/targets';
|
|
|
|
// For HLS streams
|
|
return {
|
|
embeds: [], // Can be empty when returning streams
|
|
stream: [
|
|
{
|
|
id: 'primary',
|
|
type: 'hls',
|
|
playlist: streamUrl,
|
|
flags: [flags.CORS_ALLOWED],
|
|
captions: [], // Subtitle tracks (optional)
|
|
}
|
|
]
|
|
};
|
|
|
|
// For MP4 files with a single quality
|
|
return {
|
|
embeds: [],
|
|
stream: [
|
|
{
|
|
id: 'primary',
|
|
captions,
|
|
qualities: {
|
|
unknown: {
|
|
type: 'mp4',
|
|
url: streamUrl,
|
|
},
|
|
},
|
|
type: 'file',
|
|
flags: [flags.CORS_ALLOWED],
|
|
},
|
|
],
|
|
};
|
|
|
|
// For MP4 files with multiple qualities:
|
|
// It's recommended to return it using a function similar to this:
|
|
|
|
const streams = Object.entries(data.streams).reduce((acc: Record<string, string>, [quality, url]) => {
|
|
let qualityKey: number;
|
|
if (quality === 'ORG') {
|
|
// Only add unknown quality if it's an mp4 (handle URLs with query parameters)
|
|
const urlPath = url.split('?')[0]; // Remove query parameters
|
|
if (urlPath.toLowerCase().endsWith('.mp4')) {
|
|
acc.unknown = url;
|
|
}
|
|
return acc;
|
|
}
|
|
if (quality === '4K') {
|
|
qualityKey = 2160;
|
|
} else {
|
|
qualityKey = parseInt(quality.replace('P', ''), 10);
|
|
}
|
|
if (Number.isNaN(qualityKey) || acc[qualityKey]) return acc;
|
|
acc[qualityKey] = url;
|
|
return acc;
|
|
}, {});
|
|
|
|
// Filter qualities based on provider type
|
|
const filteredStreams = Object.entries(streams).reduce((acc: Record<string, string>, [quality, url]) => {
|
|
// Skip unknown for cached provider
|
|
if (provider.useCacheUrl && quality === 'unknown') {
|
|
return acc;
|
|
}
|
|
|
|
acc[quality] = url;
|
|
return acc;
|
|
}, {});
|
|
|
|
// Returning each quality like so
|
|
return {
|
|
stream: [
|
|
{
|
|
id: 'primary',
|
|
captions: [],
|
|
qualities: {
|
|
...(filteredStreams[2160] && {
|
|
'4k': {
|
|
type: 'mp4',
|
|
url: filteredStreams[2160],
|
|
},
|
|
}),
|
|
...(filteredStreams[1080] && {
|
|
1080: {
|
|
type: 'mp4',
|
|
url: filteredStreams[1080],
|
|
},
|
|
}),
|
|
...(filteredStreams[720] && {
|
|
720: {
|
|
type: 'mp4',
|
|
url: filteredStreams[720],
|
|
},
|
|
}),
|
|
...(filteredStreams[480] && {
|
|
480: {
|
|
type: 'mp4',
|
|
url: filteredStreams[480],
|
|
},
|
|
}),
|
|
...(filteredStreams[360] && {
|
|
360: {
|
|
type: 'mp4',
|
|
url: filteredStreams[360],
|
|
},
|
|
}),
|
|
...(filteredStreams.unknown && {
|
|
unknown: {
|
|
type: 'mp4',
|
|
url: filteredStreams.unknown,
|
|
},
|
|
}),
|
|
},
|
|
type: 'file',
|
|
flags: [flags.CORS_ALLOWED],
|
|
},
|
|
],
|
|
};
|
|
```
|
|
|
|
**When to use:**
|
|
- Your scraper can extract direct video URLs
|
|
- The site provides its own player technology
|
|
- You need fine control over stream handling
|
|
- The streams don't require complex embed processing
|
|
|
|
## Context and Utilities
|
|
|
|
The scraper context (`ctx`) provides everything you need for implementation:
|
|
|
|
### Media Information
|
|
```typescript
|
|
// Basic media info (always available)
|
|
ctx.media.title // "Spirited Away"
|
|
ctx.media.type // "movie" | "show"
|
|
ctx.media.tmdbId // 129
|
|
ctx.media.releaseYear // 2001
|
|
ctx.media.imdbId // "tt0245429" (when available)
|
|
|
|
// For TV shows only (check ctx.media.type === 'show')
|
|
ctx.media.season.number // 1
|
|
ctx.media.season.tmdbId // Season TMDB ID
|
|
ctx.media.episode.number // 5
|
|
ctx.media.episode.tmdbId // Episode TMDB ID
|
|
```
|
|
|
|
### HTTP Client
|
|
```typescript
|
|
// Always use proxiedFetcher for external requests to avoid CORS
|
|
const response = await ctx.proxiedFetcher<string>('https://example.com/api', {
|
|
method: 'POST',
|
|
headers: {
|
|
'User-Agent': 'Mozilla/5.0...',
|
|
'Referer': 'https://example.com'
|
|
},
|
|
body: JSON.stringify({ key: 'value' })
|
|
});
|
|
|
|
// For API calls with base URL
|
|
const data = await ctx.proxiedFetcher('/search', {
|
|
baseUrl: 'https://api.example.com',
|
|
query: { q: ctx.media.title, year: ctx.media.releaseYear }
|
|
});
|
|
```
|
|
|
|
### Progress Updates
|
|
```typescript
|
|
// Update the loading indicator (0-100)
|
|
ctx.progress(25); // Found media page
|
|
// ... processing ...
|
|
ctx.progress(50); // Extracted embed links
|
|
// ... more processing ...
|
|
ctx.progress(90); // Almost done
|
|
```
|
|
|
|
## Common Patterns
|
|
|
|
### 1. URL Building
|
|
```typescript
|
|
// Handle different media types
|
|
const buildUrl = (ctx: ShowScrapeContext | MovieScrapeContext) => {
|
|
const apiUrl = ctx.media.type === 'movie'
|
|
? `${baseUrl}/movie/${ctx.media.tmdbId}`
|
|
: `${baseUrl}/tv/${ctx.media.tmdbId}/${ctx.media.season.number}/${ctx.media.episode.number}`;
|
|
|
|
return apiUrl;
|
|
};
|
|
```
|
|
|
|
### 2. Data Extraction
|
|
```typescript
|
|
import { load } from 'cheerio';
|
|
|
|
// Scraping with Cheerio
|
|
const $ = load(embedPage);
|
|
const embedUrls = $('iframe[src*="turbovid"]')
|
|
.map((_, el) => $(el).attr('src'))
|
|
.get()
|
|
.filter(Boolean);
|
|
|
|
// Regex extraction
|
|
const configMatch = embedPage.match(/window\.playerConfig\s*=\s*({.*?});/s);
|
|
if (configMatch) {
|
|
const config = JSON.parse(configMatch[1]);
|
|
// Process config...
|
|
}
|
|
```
|
|
|
|
### 3. Error Handling
|
|
```typescript
|
|
import { NotFoundError } from '@/utils/errors';
|
|
|
|
// Throw NotFoundError for content not found
|
|
if (!embedUrls.length) {
|
|
throw new NotFoundError('No embed players found');
|
|
}
|
|
|
|
// Throw generic Error for other issues
|
|
if (!apiResponse.success) {
|
|
throw new Error(`API request failed: ${apiResponse.message}`);
|
|
}
|
|
```
|
|
|
|
### 4. Protected Streams
|
|
There are several ways to bypass protections on streams.
|
|
|
|
Using the M3U8 proxy:
|
|
```typescript
|
|
import { createM3U8ProxyUrl } from '@/utils/proxy';
|
|
|
|
// For streams that require special headers
|
|
const streamHeaders = {
|
|
'Referer': 'https://player.example.com/',
|
|
'Origin': 'https://player.example.com',
|
|
'User-Agent': 'Mozilla/5.0...'
|
|
};
|
|
|
|
return {
|
|
stream: [{
|
|
id: 'primary',
|
|
type: 'hls',
|
|
playlist: createM3U8ProxyUrl(originalPlaylist, ctx.features, streamHeaders),
|
|
headers: streamHeaders, // Include headers in the createM3U8ProxyUrl function and here for native and extension targets
|
|
flags: [flags.CORS_ALLOWED], // createM3U8ProxyUrl (or the extension) bypasses cors so we say it's allowed to play in a browser
|
|
captions: []
|
|
}]
|
|
};
|
|
```
|
|
|
|
Using the browser extension:
|
|
```typescript
|
|
// For streams that require special headers
|
|
const streamHeaders = {
|
|
'Referer': 'https://player.example.com/',
|
|
'Origin': 'https://player.example.com',
|
|
'User-Agent': 'Mozilla/5.0...'
|
|
};
|
|
|
|
return {
|
|
stream: [{
|
|
id: 'primary',
|
|
type: 'hls',
|
|
playlist: originalPlaylist,
|
|
headers: streamHeaders,
|
|
flags: [], // Use the extension because it can pass headers; include no flags for extension or native targets
|
|
captions: []
|
|
}]
|
|
};
|
|
```
|
|
|
|
## Building Embed Scrapers
|
|
|
|
Embed scrapers follow a simpler pattern since they only handle one URL type:
|
|
|
|
```typescript
|
|
import { makeEmbed } from '@/providers/base';
|
|
|
|
export const myEmbedScraper = makeEmbed({
|
|
id: 'my-embed',
|
|
name: 'My Embed Player',
|
|
rank: 120,
|
|
async scrape(ctx) {
|
|
// ctx.url contains the embed URL from a source
|
|
|
|
// 1. Fetch the embed page
|
|
const embedPage = await ctx.proxiedFetcher(ctx.url);
|
|
|
|
// 2. Extract the stream URL (example with regex)
|
|
const streamMatch = embedPage.match(/src:\s*["']([^"']+\.m3u8[^"']*)/);
|
|
if (!streamMatch) {
|
|
throw new NotFoundError('No stream found in embed');
|
|
}
|
|
|
|
// 3. Return the stream
|
|
return {
|
|
stream: [{
|
|
id: 'primary',
|
|
type: 'hls',
|
|
playlist: streamMatch[1],
|
|
flags: [flags.CORS_ALLOWED],
|
|
captions: []
|
|
}]
|
|
};
|
|
},
|
|
});
|
|
```
|
|
|
|
## Testing Your Scrapers
|
|
|
|
### 1. Basic Testing
|
|
```sh
|
|
# Test your scraper with CLI
|
|
pnpm cli --source-id my-scraper --tmdb-id 11527
|
|
|
|
# Test different content types
|
|
pnpm cli --source-id my-scraper --tmdb-id 94605 --season 1 --episode 1 # TV show
|
|
```
|
|
|
|
### 2. Real CLI Output Examples
|
|
|
|
**Testing a source that returns embeds:**
|
|
```sh
|
|
pnpm cli --source-id catflix --tmdb-id 11527
|
|
```
|
|
```js
|
|
{
|
|
embeds: [
|
|
{
|
|
embedId: 'turbovid',
|
|
url: 'https://turbovid.eu/embed/DjncbDBEmbLW'
|
|
}
|
|
]
|
|
}
|
|
```
|
|
|
|
**Testing an embed that returns streams:**
|
|
```sh
|
|
pnpm cli --source-id turbovid --url "https://turbovid.eu/embed/DjncbDBEmbLW"
|
|
```
|
|
```js
|
|
{
|
|
stream: [
|
|
{
|
|
type: 'hls',
|
|
id: 'primary',
|
|
playlist: 'https://proxy.fifthwit.net/m3u8-proxy?url=https%3A%2F%2Fqueenselti.pro%2Fwrofm%2Fuwu.m3u8&headers=%7B%22referer%22%3A%22https%3A%2F%2Fturbovid.eu%2F%22%2C%22origin%22%3A%22https%3A%2F%2Fturbovid.eu%22%7D',
|
|
flags: [flags.CORS_ALLOWED],
|
|
captions: []
|
|
}
|
|
]
|
|
}
|
|
```
|
|
|
|
**Notice**: The playlist URL shows how `createM3U8ProxyUrl()` creates proxied URLs to handle protected streams.
|
|
|
|
### 3. Comprehensive Testing
|
|
Test with various content:
|
|
- Popular movies (The Shining: 11527, Spirited Away: 129, Avatar: 19995)
|
|
- Recent releases (check current popular movies)
|
|
- TV shows with multiple seasons
|
|
- Anime series (different episode numbering)
|
|
- Different languages/regions
|
|
|
|
### 4. Debug Mode
|
|
```typescript
|
|
// Add debug logging to your scraper
|
|
console.log('Fetching URL:', embedUrl);
|
|
console.log('Response status:', response.status);
|
|
console.log('Extracted data:', extractedData);
|
|
```
|
|
|
|
## Next Steps
|
|
|
|
Once you've built your scraper:
|
|
|
|
1. Test thoroughly with multiple content types
|
|
2. Check [Advanced Concepts](/in-depth/advanced-concepts) for flags and error handling
|
|
3. Register in `all.ts` with a unique rank
|
|
4. Submit a pull request with testing documentation
|
|
|
|
::alert{type="warning"}
|
|
Always test your scrapers with both movies and TV shows, and include multiple examples in your pull request description.
|
|
::
|