From 2d8b4f99fdf7e30159d0ac17d09d8928d4f7407a Mon Sep 17 00:00:00 2001 From: Pas <74743263+Pasithea0@users.noreply.github.com> Date: Fri, 1 Aug 2025 15:23:19 -0600 Subject: [PATCH] attempt to bruteforce fix utf8 subs --- src/backend/helpers/subs.ts | 17 +++-- .../player/atoms/settings/CaptionsView.tsx | 11 ++-- src/components/player/utils/captions.ts | 65 ++++++++++++++++++- 3 files changed, 81 insertions(+), 12 deletions(-) diff --git a/src/backend/helpers/subs.ts b/src/backend/helpers/subs.ts index a49aa247..32b9480d 100644 --- a/src/backend/helpers/subs.ts +++ b/src/backend/helpers/subs.ts @@ -1,7 +1,10 @@ import { list } from "subsrt-ts"; import { proxiedFetch } from "@/backend/helpers/fetch"; -import { convertSubtitlesToSrt } from "@/components/player/utils/captions"; +import { + convertSubtitlesToSrt, + fixUTF8Encoding, +} from "@/components/player/utils/captions"; import { CaptionListItem } from "@/stores/player/slices/source"; import { SimpleCache } from "@/utils/cache"; @@ -62,13 +65,14 @@ export async function downloadCaption( } if (!data) throw new Error("failed to get caption data"); - // Ensure the data is in UTF-8 + // Ensure the data is in UTF-8 and fix any encoding issues const encoder = new TextEncoder(); const decoder = new TextDecoder("utf-8"); const utf8Bytes = encoder.encode(data); const utf8Data = decoder.decode(utf8Bytes); + const fixedData = fixUTF8Encoding(utf8Data); - const output = convertSubtitlesToSrt(utf8Data); + const output = convertSubtitlesToSrt(fixedData); downloadCache.set(caption.url, output, expirySeconds); return output; } @@ -93,11 +97,12 @@ export async function downloadWebVTT(url: string): Promise { const decoder = new TextDecoder(charset); const data = decoder.decode(buffer); - // Ensure the data is in UTF-8 + // Ensure the data is in UTF-8 and fix any encoding issues const encoder = new TextEncoder(); const utf8Bytes = encoder.encode(data); const utf8Data = decoder.decode(utf8Bytes); + const fixedData = 
fixUTF8Encoding(utf8Data); - downloadCache.set(url, utf8Data, expirySeconds); - return utf8Data; + downloadCache.set(url, fixedData, expirySeconds); + return fixedData; } diff --git a/src/components/player/atoms/settings/CaptionsView.tsx b/src/components/player/atoms/settings/CaptionsView.tsx index 3808a360..2c3fcbeb 100644 --- a/src/components/player/atoms/settings/CaptionsView.tsx +++ b/src/components/player/atoms/settings/CaptionsView.tsx @@ -10,6 +10,7 @@ import { Icon, Icons } from "@/components/Icon"; import { useCaptions } from "@/components/player/hooks/useCaptions"; import { Menu } from "@/components/player/internals/ContextMenu"; import { SelectableLink } from "@/components/player/internals/ContextMenu/Links"; +import { fixUTF8Encoding } from "@/components/player/utils/captions"; import { useOverlayRouter } from "@/hooks/useOverlayRouter"; import { usePlayerStore } from "@/stores/player/store"; import { useSubtitleStore } from "@/stores/subtitles"; @@ -151,13 +152,14 @@ export function CustomCaptionOption() { if (!event.target || typeof event.target.result !== "string") return; - // Ensure the data is in UTF-8 + // Ensure the data is in UTF-8 and fix any encoding issues const encoder = new TextEncoder(); const decoder = new TextDecoder("utf-8"); const utf8Bytes = encoder.encode(event.target.result); const utf8Data = decoder.decode(utf8Bytes); + const fixedData = fixUTF8Encoding(utf8Data); - const converted = convert(utf8Data, "srt"); + const converted = convert(fixedData, "srt"); setCaption({ language: "custom", srtData: converted, @@ -203,13 +205,14 @@ export function CaptionsView({ reader.addEventListener("load", (e) => { if (!e.target || typeof e.target.result !== "string") return; - // Ensure the data is in UTF-8 + // Ensure the data is in UTF-8 and fix any encoding issues const encoder = new TextEncoder(); const decoder = new TextDecoder("utf-8"); const utf8Bytes = encoder.encode(e.target.result); const utf8Data = decoder.decode(utf8Bytes); + const 
fixedData = fixUTF8Encoding(utf8Data); - const converted = convert(utf8Data, "srt"); + const converted = convert(fixedData, "srt"); setCaption({ language: "custom", diff --git a/src/components/player/utils/captions.ts b/src/components/player/utils/captions.ts index 09f79fbe..68247bb6 100644 --- a/src/components/player/utils/captions.ts +++ b/src/components/player/utils/captions.ts @@ -8,6 +8,91 @@ import { CaptionListItem } from "@/stores/player/slices/source"; export type CaptionCueType = ContentCaption; export const sanitize = DOMPurify.sanitize;
+// Characters that Windows-1252 assigns to bytes 0x80-0x9F. Every other byte
+// value maps to the character with the same code point, as in ISO-8859-1.
+const cp1252SpecialBytes: ReadonlyMap<string, number> = new Map([
+  ["\u20AC", 0x80],
+  ["\u201A", 0x82],
+  ["\u0192", 0x83],
+  ["\u201E", 0x84],
+  ["\u2026", 0x85],
+  ["\u2020", 0x86],
+  ["\u2021", 0x87],
+  ["\u02C6", 0x88],
+  ["\u2030", 0x89],
+  ["\u0160", 0x8a],
+  ["\u2039", 0x8b],
+  ["\u0152", 0x8c],
+  ["\u017D", 0x8e],
+  ["\u2018", 0x91],
+  ["\u2019", 0x92],
+  ["\u201C", 0x93],
+  ["\u201D", 0x94],
+  ["\u2022", 0x95],
+  ["\u2013", 0x96],
+  ["\u2014", 0x97],
+  ["\u02DC", 0x98],
+  ["\u2122", 0x99],
+  ["\u0161", 0x9a],
+  ["\u203A", 0x9b],
+  ["\u0153", 0x9c],
+  ["\u017E", 0x9e],
+  ["\u0178", 0x9f],
+]);
+
+// Strict decoder: throws on malformed input instead of emitting U+FFFD.
+const strictUtf8Decoder = new TextDecoder("utf-8", { fatal: true });
+
+// Upper bound on repair passes; a second pass undoes double-encoded text.
+const maxRepairPasses = 3;
+
+/**
+ * Reinterprets the text as a Windows-1252 byte string and decodes it as UTF-8.
+ * Returns null when a character has no single-byte form or the bytes are not
+ * valid UTF-8, i.e. when the text does not look mis-decoded.
+ */
+function redecodeAsUtf8(text: string): string | null {
+  const bytes = new Uint8Array(text.length);
+  for (let i = 0; i < text.length; i += 1) {
+    const char = text.charAt(i);
+    const byte = cp1252SpecialBytes.get(char) ?? char.charCodeAt(0);
+    if (byte > 0xff) return null;
+    bytes[i] = byte;
+  }
+  try {
+    return strictUtf8Decoder.decode(bytes);
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Repairs subtitle text whose UTF-8 bytes were decoded as Windows-1252 /
+ * ISO-8859-1 ("mojibake"), including text that was mangled more than once.
+ *
+ * The repair is all-or-nothing: the text is rewritten only when the whole
+ * string, read back as single bytes, is valid UTF-8. Spaces are never removed,
+ * text containing characters with no single-byte form (CJK, emoji) is returned
+ * unchanged, and so is text that is only partially corrupted.
+ *
+ * Example (shown with string-literal escapes):
+ * Input: "Hyv\u00C3\u00A4 on, ohjelma oli t\u00C3\u00A4ss\u00C3\u00A4."
+ * Output: "Hyv\u00E4 on, ohjelma oli t\u00E4ss\u00E4."
+ *
+ * @param text - Subtitle text that may have been mis-decoded.
+ * @returns The repaired text, or the input unchanged when no repair applies.
+ */
+export function fixUTF8Encoding(text: string): string {
+  let repaired = text;
+  for (let pass = 0; pass < maxRepairPasses; pass += 1) {
+    const decoded = redecodeAsUtf8(repaired);
+    // Stop once the text is not repairable or a pass changes nothing (ASCII).
+    if (decoded === null || decoded === repaired) break;
+    repaired = decoded;
+  }
+  return repaired;
+}
+
 export function captionIsVisible( start: number, end: number, @@ -31,7 +116,9 @@ export function convertSubtitlesToVtt(text: string): string { if (textTrimmed === "") { throw new Error("Given text is empty"); } - const vtt = convert(textTrimmed, "vtt"); + // Fix UTF-8 encoding issues before conversion + const fixedText = fixUTF8Encoding(textTrimmed); + const vtt = convert(fixedText, "vtt"); if (detect(vtt) === "") { throw new Error("Invalid subtitle format"); } @@ -43,7 +130,9 @@ export function convertSubtitlesToSrt(text: string): string { if (textTrimmed === "") { throw new Error("Given text is empty"); } - const srt = convert(textTrimmed, "srt"); + // Fix UTF-8 encoding issues before conversion + const fixedText = fixUTF8Encoding(textTrimmed); + const srt = convert(fixedText, "srt"); if (detect(srt) === "") { throw new Error("Invalid subtitle format"); }