mirror of
https://github.com/hoornet/vega.git
synced 2026-05-06 20:29:12 -07:00
72 lines
1.8 KiB
TypeScript
72 lines
1.8 KiB
TypeScript
// Unicode script detection for feed filtering
|
|
|
|
const SCRIPT_RANGES: [string, RegExp][] = [
|
|
["Latin", /[\u0041-\u024F\u1E00-\u1EFF]/],
|
|
["CJK", /[\u2E80-\u2FFF\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF\uFE30-\uFE4F\uFF00-\uFFEF]|[\uD840-\uD87F][\uDC00-\uDFFF]/],
|
|
["Cyrillic", /[\u0400-\u04FF\u0500-\u052F]/],
|
|
["Arabic", /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/],
|
|
["Devanagari", /[\u0900-\u097F]/],
|
|
["Thai", /[\u0E00-\u0E7F]/],
|
|
["Korean", /[\uAC00-\uD7AF\u1100-\u11FF]/],
|
|
["Hebrew", /[\u0590-\u05FF]/],
|
|
["Greek", /[\u0370-\u03FF]/],
|
|
["Georgian", /[\u10A0-\u10FF]/],
|
|
["Armenian", /[\u0530-\u058F]/],
|
|
];
|
|
|
|
export function detectScript(text: string): string {
|
|
// Strip URLs, mentions, hashtags to avoid noise
|
|
const cleaned = text
|
|
.replace(/https?:\/\/\S+/g, "")
|
|
.replace(/nostr:\S+/g, "")
|
|
.replace(/#\w+/g, "")
|
|
.trim();
|
|
|
|
if (!cleaned) return "Unknown";
|
|
|
|
// Count characters per script
|
|
const counts = new Map<string, number>();
|
|
for (const char of cleaned) {
|
|
for (const [name, regex] of SCRIPT_RANGES) {
|
|
if (regex.test(char)) {
|
|
counts.set(name, (counts.get(name) ?? 0) + 1);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (counts.size === 0) return "Unknown";
|
|
|
|
// Return dominant script
|
|
let maxScript = "Unknown";
|
|
let maxCount = 0;
|
|
for (const [script, count] of counts) {
|
|
if (count > maxCount) {
|
|
maxScript = script;
|
|
maxCount = count;
|
|
}
|
|
}
|
|
|
|
return maxScript;
|
|
}
|
|
|
|
// Check NIP-32 language tags on an event
|
|
export function getEventLanguageTag(tags: string[][]): string | null {
|
|
const langTag = tags.find(
|
|
(t) => t[0] === "l" && t[2] === "ISO-639-1"
|
|
);
|
|
return langTag?.[1] ?? null;
|
|
}
|
|
|
|
export const FILTER_SCRIPTS = [
|
|
"Latin",
|
|
"CJK",
|
|
"Cyrillic",
|
|
"Arabic",
|
|
"Devanagari",
|
|
"Thai",
|
|
"Korean",
|
|
"Hebrew",
|
|
"Greek",
|
|
] as const;
|