attempt to bruteforce fix utf8 subs

This commit is contained in:
Pas 2025-08-01 15:23:19 -06:00
parent 8f5728ebf0
commit 2d8b4f99fd
3 changed files with 81 additions and 12 deletions

View file

@ -1,7 +1,10 @@
import { list } from "subsrt-ts";
import { proxiedFetch } from "@/backend/helpers/fetch";
import { convertSubtitlesToSrt } from "@/components/player/utils/captions";
import {
convertSubtitlesToSrt,
fixUTF8Encoding,
} from "@/components/player/utils/captions";
import { CaptionListItem } from "@/stores/player/slices/source";
import { SimpleCache } from "@/utils/cache";
@ -62,13 +65,14 @@ export async function downloadCaption(
}
if (!data) throw new Error("failed to get caption data");
// Ensure the data is in UTF-8
// Ensure the data is in UTF-8 and fix any encoding issues
const encoder = new TextEncoder();
const decoder = new TextDecoder("utf-8");
const utf8Bytes = encoder.encode(data);
const utf8Data = decoder.decode(utf8Bytes);
const fixedData = fixUTF8Encoding(utf8Data);
const output = convertSubtitlesToSrt(utf8Data);
const output = convertSubtitlesToSrt(fixedData);
downloadCache.set(caption.url, output, expirySeconds);
return output;
}
@ -93,11 +97,12 @@ export async function downloadWebVTT(url: string): Promise<string> {
const decoder = new TextDecoder(charset);
const data = decoder.decode(buffer);
// Ensure the data is in UTF-8
// Ensure the data is in UTF-8 and fix any encoding issues
const encoder = new TextEncoder();
const utf8Bytes = encoder.encode(data);
const utf8Data = decoder.decode(utf8Bytes);
const fixedData = fixUTF8Encoding(utf8Data);
downloadCache.set(url, utf8Data, expirySeconds);
return utf8Data;
downloadCache.set(url, fixedData, expirySeconds);
return fixedData;
}

View file

@ -10,6 +10,7 @@ import { Icon, Icons } from "@/components/Icon";
import { useCaptions } from "@/components/player/hooks/useCaptions";
import { Menu } from "@/components/player/internals/ContextMenu";
import { SelectableLink } from "@/components/player/internals/ContextMenu/Links";
import { fixUTF8Encoding } from "@/components/player/utils/captions";
import { useOverlayRouter } from "@/hooks/useOverlayRouter";
import { usePlayerStore } from "@/stores/player/store";
import { useSubtitleStore } from "@/stores/subtitles";
@ -151,13 +152,14 @@ export function CustomCaptionOption() {
if (!event.target || typeof event.target.result !== "string")
return;
// Ensure the data is in UTF-8
// Ensure the data is in UTF-8 and fix any encoding issues
const encoder = new TextEncoder();
const decoder = new TextDecoder("utf-8");
const utf8Bytes = encoder.encode(event.target.result);
const utf8Data = decoder.decode(utf8Bytes);
const fixedData = fixUTF8Encoding(utf8Data);
const converted = convert(utf8Data, "srt");
const converted = convert(fixedData, "srt");
setCaption({
language: "custom",
srtData: converted,
@ -203,13 +205,14 @@ export function CaptionsView({
reader.addEventListener("load", (e) => {
if (!e.target || typeof e.target.result !== "string") return;
// Ensure the data is in UTF-8
// Ensure the data is in UTF-8 and fix any encoding issues
const encoder = new TextEncoder();
const decoder = new TextDecoder("utf-8");
const utf8Bytes = encoder.encode(e.target.result);
const utf8Data = decoder.decode(utf8Bytes);
const fixedData = fixUTF8Encoding(utf8Data);
const converted = convert(utf8Data, "srt");
const converted = convert(fixedData, "srt");
setCaption({
language: "custom",

View file

@ -8,6 +8,63 @@ import { CaptionListItem } from "@/stores/player/slices/source";
export type CaptionCueType = ContentCaption;
export const sanitize = DOMPurify.sanitize;
// Characters that windows-1252 assigns to bytes 0x80-0x9F, keyed by the
// character and mapped back to the original byte. Every other character in the
// U+0080-U+00FF range maps to the byte with the same numeric value (this also
// covers text that was mis-decoded as ISO-8859-1).
const cp1252HighBytes: ReadonlyMap<string, number> = new Map([
  ["\u20AC", 0x80],
  ["\u201A", 0x82],
  ["\u0192", 0x83],
  ["\u201E", 0x84],
  ["\u2026", 0x85],
  ["\u2020", 0x86],
  ["\u2021", 0x87],
  ["\u02C6", 0x88],
  ["\u2030", 0x89],
  ["\u0160", 0x8a],
  ["\u2039", 0x8b],
  ["\u0152", 0x8c],
  ["\u017D", 0x8e],
  ["\u2018", 0x91],
  ["\u2019", 0x92],
  ["\u201C", 0x93],
  ["\u201D", 0x94],
  ["\u2022", 0x95],
  ["\u2013", 0x96],
  ["\u2014", 0x97],
  ["\u02DC", 0x98],
  ["\u2122", 0x99],
  ["\u0161", 0x9a],
  ["\u203A", 0x9b],
  ["\u0153", 0x9c],
  ["\u017E", 0x9e],
  ["\u0178", 0x9f],
]);

// Character class matching anything a UTF-8 continuation byte (0x80-0xBF) can
// turn into when it is wrongly decoded as windows-1252 / ISO-8859-1.
const continuationClass = `[\\u0080-\\u00BF${[...cp1252HighBytes.keys()].join(
  "",
)}]`;

// Matches, in priority order: a mis-decoded 2-, 3- or 4-byte UTF-8 sequence
// (lead byte character followed by the right number of continuation
// characters), or otherwise a single stray character from U+0080-U+00FF.
const highCharScan = new RegExp(
  [
    `[\\u00C2-\\u00DF]${continuationClass}`,
    `[\\u00E0-\\u00EF]${continuationClass}{2}`,
    `[\\u00F0-\\u00F4]${continuationClass}{3}`,
    "[\\u0080-\\u00FF]",
  ].join("|"),
  "g",
);

// Strict decoder: throws on invalid UTF-8 instead of emitting U+FFFD, so
// look-alike sequences that are not real UTF-8 are never "repaired".
const strictUtf8Decoder = new TextDecoder("utf-8", {
  fatal: true,
  ignoreBOM: true,
});

// Upper bound on repair passes; two passes already cover double-encoded text.
const maxRepairPasses = 3;

/**
 * Runs a single repair pass over the text.
 * Returns the input unchanged unless every character in U+0080-U+00FF is part
 * of a well-formed, decodable UTF-8 sequence.
 */
function repairMisdecodedUtf8Once(text: string): string {
  let consistent = true;
  const repaired = text.replace(highCharScan, (match) => {
    if (match.length === 1) {
      // A high Latin-1 character outside any UTF-8 sequence means (at least
      // part of) the text was decoded correctly, so it must be left alone.
      consistent = false;
      return match;
    }
    const bytes = Uint8Array.from(
      Array.from(match, (ch) => cp1252HighBytes.get(ch) ?? ch.charCodeAt(0)),
    );
    try {
      return strictUtf8Decoder.decode(bytes);
    } catch {
      // Right shape but not valid UTF-8 (overlong form, surrogate, ...).
      consistent = false;
      return match;
    }
  });
  return consistent ? repaired : text;
}

/**
 * Repairs subtitle text whose UTF-8 bytes were wrongly decoded as
 * windows-1252 / ISO-8859-1 ("mojibake").
 *
 * Each suspicious character run is mapped back to the bytes it came from and
 * re-decoded as strict UTF-8. The repair is applied only when the whole text
 * is consistently mis-decoded, i.e. it contains no stray character from
 * U+0080-U+00FF outside a valid sequence. Correctly decoded text such as
 * "NÃO" or "Âge" is therefore returned untouched, and calling the function
 * repeatedly is harmless. Double-encoded text is handled by repeating the
 * pass a bounded number of times.
 *
 * Example:
 * Input: "HyvÃ¤ on, ohjelma oli tÃ¤ssÃ¤."
 * Output: "Hyvä on, ohjelma oli tässä."
 *
 * @param text - Subtitle text that may contain mis-decoded UTF-8.
 * @returns The repaired text, or the original text when no safe repair exists.
 */
export function fixUTF8Encoding(text: string): string {
  let current = text;
  for (let pass = 0; pass < maxRepairPasses; pass += 1) {
    const next = repairMisdecodedUtf8Once(current);
    if (next === current) break;
    current = next;
  }
  return current;
}
export function captionIsVisible(
start: number,
end: number,
@ -31,7 +88,9 @@ export function convertSubtitlesToVtt(text: string): string {
if (textTrimmed === "") {
throw new Error("Given text is empty");
}
const vtt = convert(textTrimmed, "vtt");
// Fix UTF-8 encoding issues before conversion
const fixedText = fixUTF8Encoding(textTrimmed);
const vtt = convert(fixedText, "vtt");
if (detect(vtt) === "") {
throw new Error("Invalid subtitle format");
}
@ -43,7 +102,9 @@ export function convertSubtitlesToSrt(text: string): string {
if (textTrimmed === "") {
throw new Error("Given text is empty");
}
const srt = convert(textTrimmed, "srt");
// Fix UTF-8 encoding issues before conversion
const fixedText = fixUTF8Encoding(textTrimmed);
const srt = convert(fixedText, "srt");
if (detect(srt) === "") {
throw new Error("Invalid subtitle format");
}