mirror of
https://github.com/p-stream/p-stream.git
synced 2026-03-28 11:38:43 +00:00
attempt to bruteforce fix utf8 subs
This commit is contained in:
parent
8f5728ebf0
commit
2d8b4f99fd
3 changed files with 81 additions and 12 deletions
|
|
@ -1,7 +1,10 @@
|
|||
import { list } from "subsrt-ts";
|
||||
|
||||
import { proxiedFetch } from "@/backend/helpers/fetch";
|
||||
import { convertSubtitlesToSrt } from "@/components/player/utils/captions";
|
||||
import {
|
||||
convertSubtitlesToSrt,
|
||||
fixUTF8Encoding,
|
||||
} from "@/components/player/utils/captions";
|
||||
import { CaptionListItem } from "@/stores/player/slices/source";
|
||||
import { SimpleCache } from "@/utils/cache";
|
||||
|
||||
|
|
@ -62,13 +65,14 @@ export async function downloadCaption(
|
|||
}
|
||||
if (!data) throw new Error("failed to get caption data");
|
||||
|
||||
// Ensure the data is in UTF-8
|
||||
// Ensure the data is in UTF-8 and fix any encoding issues
|
||||
const encoder = new TextEncoder();
|
||||
const decoder = new TextDecoder("utf-8");
|
||||
const utf8Bytes = encoder.encode(data);
|
||||
const utf8Data = decoder.decode(utf8Bytes);
|
||||
const fixedData = fixUTF8Encoding(utf8Data);
|
||||
|
||||
const output = convertSubtitlesToSrt(utf8Data);
|
||||
const output = convertSubtitlesToSrt(fixedData);
|
||||
downloadCache.set(caption.url, output, expirySeconds);
|
||||
return output;
|
||||
}
|
||||
|
|
@ -93,11 +97,12 @@ export async function downloadWebVTT(url: string): Promise<string> {
|
|||
const decoder = new TextDecoder(charset);
|
||||
const data = decoder.decode(buffer);
|
||||
|
||||
// Ensure the data is in UTF-8
|
||||
// Ensure the data is in UTF-8 and fix any encoding issues
|
||||
const encoder = new TextEncoder();
|
||||
const utf8Bytes = encoder.encode(data);
|
||||
const utf8Data = decoder.decode(utf8Bytes);
|
||||
const fixedData = fixUTF8Encoding(utf8Data);
|
||||
|
||||
downloadCache.set(url, utf8Data, expirySeconds);
|
||||
return utf8Data;
|
||||
downloadCache.set(url, fixedData, expirySeconds);
|
||||
return fixedData;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import { Icon, Icons } from "@/components/Icon";
|
|||
import { useCaptions } from "@/components/player/hooks/useCaptions";
|
||||
import { Menu } from "@/components/player/internals/ContextMenu";
|
||||
import { SelectableLink } from "@/components/player/internals/ContextMenu/Links";
|
||||
import { fixUTF8Encoding } from "@/components/player/utils/captions";
|
||||
import { useOverlayRouter } from "@/hooks/useOverlayRouter";
|
||||
import { usePlayerStore } from "@/stores/player/store";
|
||||
import { useSubtitleStore } from "@/stores/subtitles";
|
||||
|
|
@ -151,13 +152,14 @@ export function CustomCaptionOption() {
|
|||
if (!event.target || typeof event.target.result !== "string")
|
||||
return;
|
||||
|
||||
// Ensure the data is in UTF-8
|
||||
// Ensure the data is in UTF-8 and fix any encoding issues
|
||||
const encoder = new TextEncoder();
|
||||
const decoder = new TextDecoder("utf-8");
|
||||
const utf8Bytes = encoder.encode(event.target.result);
|
||||
const utf8Data = decoder.decode(utf8Bytes);
|
||||
const fixedData = fixUTF8Encoding(utf8Data);
|
||||
|
||||
const converted = convert(utf8Data, "srt");
|
||||
const converted = convert(fixedData, "srt");
|
||||
setCaption({
|
||||
language: "custom",
|
||||
srtData: converted,
|
||||
|
|
@ -203,13 +205,14 @@ export function CaptionsView({
|
|||
reader.addEventListener("load", (e) => {
|
||||
if (!e.target || typeof e.target.result !== "string") return;
|
||||
|
||||
// Ensure the data is in UTF-8
|
||||
// Ensure the data is in UTF-8 and fix any encoding issues
|
||||
const encoder = new TextEncoder();
|
||||
const decoder = new TextDecoder("utf-8");
|
||||
const utf8Bytes = encoder.encode(e.target.result);
|
||||
const utf8Data = decoder.decode(utf8Bytes);
|
||||
const fixedData = fixUTF8Encoding(utf8Data);
|
||||
|
||||
const converted = convert(utf8Data, "srt");
|
||||
const converted = convert(fixedData, "srt");
|
||||
|
||||
setCaption({
|
||||
language: "custom",
|
||||
|
|
|
|||
|
|
@ -8,6 +8,63 @@ import { CaptionListItem } from "@/stores/player/slices/source";
|
|||
export type CaptionCueType = ContentCaption;
|
||||
export const sanitize = DOMPurify.sanitize;
|
||||
|
||||
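// Lookup table for repairing mojibake in subtitles: each key is a corrupted character sequence (as typically produced when UTF-8 bytes are decoded with the wrong charset) and each value is the text that replaces it.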
|
||||
const utf8Map: Record<string, string> = {
|
||||
"ä": "ä",
|
||||
"Ä": "Ä",
|
||||
"ä": "ä",
|
||||
"Ä": "Ä",
|
||||
"ö": "ö",
|
||||
"ö": "ö",
|
||||
"Ã¥": "å",
|
||||
"Ã¥": "å",
|
||||
"é": "é",
|
||||
"é": "é",
|
||||
ú: "ú",
|
||||
ú: "ú",
|
||||
"ñ": "ñ",
|
||||
"ñ": "ñ",
|
||||
"á": "á",
|
||||
"á": "á",
|
||||
"ÃÂ": "í",
|
||||
"Ã": "í",
|
||||
"ó": "ó",
|
||||
"ó": "ó",
|
||||
"ü": "ü",
|
||||
"ü": "ü",
|
||||
"ç": "ç",
|
||||
"ç": "ç",
|
||||
"è": "è",
|
||||
"è": "è",
|
||||
"ì": "ì",
|
||||
"ì": "ì",
|
||||
"ò": "ò",
|
||||
"ò": "ò",
|
||||
"ù": "ù",
|
||||
"ù": "ù",
|
||||
ÃÂ: "à",
|
||||
Ã: "à",
|
||||
"Â": "",
|
||||
Â: "",
|
||||
"Â ": "",
|
||||
};
|
||||
|
||||
/**
 * Repairs mojibake in subtitle text by replacing every corrupted character
 * sequence listed in a lookup table with the text it maps to.
 *
 * Replacements are applied longest-key-first. A short key (for example a lone
 * "Â") therefore can never consume part of a longer sequence (for example
 * "Â " or "Ã¤") before that longer sequence has had a chance to match.
 *
 * Example (with a table that contains "Ã¤" -> "ä"):
 *   Input:  "HyvÃ¤ on, ohjelma oli tÃ¤ssÃ¤."
 *   Output: "Hyvä on, ohjelma oli tässä."
 *
 * @param text - Subtitle text that may contain corrupted sequences.
 * @param replacements - Table of corrupted sequence -> replacement text.
 *   Defaults to the module-level `utf8Map`.
 * @returns The text with every listed sequence replaced. Text that contains
 *   none of the sequences is returned unchanged.
 */
export function fixUTF8Encoding(
  text: string,
  replacements: Readonly<Record<string, string>> = utf8Map,
): string {
  const ordered = Object.entries(replacements)
    // An empty key would make split("") break the text into single characters
    // and splice the replacement between all of them, so it is skipped.
    .filter(([bad]) => bad !== "")
    // Longest keys first; Array.prototype.sort is stable, so keys of equal
    // length keep their order from the table.
    .sort(([a], [b]) => b.length - a.length);

  let fixedText = text;
  for (const [bad, good] of ordered) {
    // split/join replaces every occurrence without regex escaping concerns.
    fixedText = fixedText.split(bad).join(good);
  }
  return fixedText;
}
|
||||
|
||||
export function captionIsVisible(
|
||||
start: number,
|
||||
end: number,
|
||||
|
|
@ -31,7 +88,9 @@ export function convertSubtitlesToVtt(text: string): string {
|
|||
if (textTrimmed === "") {
|
||||
throw new Error("Given text is empty");
|
||||
}
|
||||
const vtt = convert(textTrimmed, "vtt");
|
||||
// Fix UTF-8 encoding issues before conversion
|
||||
const fixedText = fixUTF8Encoding(textTrimmed);
|
||||
const vtt = convert(fixedText, "vtt");
|
||||
if (detect(vtt) === "") {
|
||||
throw new Error("Invalid subtitle format");
|
||||
}
|
||||
|
|
@ -43,7 +102,9 @@ export function convertSubtitlesToSrt(text: string): string {
|
|||
if (textTrimmed === "") {
|
||||
throw new Error("Given text is empty");
|
||||
}
|
||||
const srt = convert(textTrimmed, "srt");
|
||||
// Fix UTF-8 encoding issues before conversion
|
||||
const fixedText = fixUTF8Encoding(textTrimmed);
|
||||
const srt = convert(fixedText, "srt");
|
||||
if (detect(srt) === "") {
|
||||
throw new Error("Invalid subtitle format");
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue