Roll back to the old text sanitizer

This commit is contained in:
2025-07-21 19:34:59 +08:00
parent 44ec076e59
commit 11fd0c011b

View File

@@ -6,35 +6,23 @@ namespace DysonNetwork.Shared.Content;
 public abstract partial class TextSanitizer
 {
-    [GeneratedRegex(@"[\u0000-\u001F\u007F\u200B-\u200F\u202A-\u202E\u2060-\u206F\uFFF0-\uFFFF]")]
-    private static partial Regex WeirdUnicodeRegex();
-
-    [GeneratedRegex(@"[\r\n]{2,}")]
-    private static partial Regex MultiNewlineRegex();
-
     public static string? Sanitize(string? text)
     {
-        if (text is null) return null;
+        if (string.IsNullOrEmpty(text)) return text;

-        // Normalize weird Unicode characters
-        var cleaned = WeirdUnicodeRegex().Replace(text, "");
+        // List of control characters to preserve
+        var preserveControlChars = new[] { '\n', '\r', '\t', ' ' };

-        // Normalize bold/italic/fancy unicode letters to ASCII
-        cleaned = NormalizeFancyUnicode(cleaned);
+        var filtered = new StringBuilder();
+        foreach (var ch in from ch in text
+                 let category = CharUnicodeInfo.GetUnicodeCategory(ch)
+                 where category is not UnicodeCategory.Control || preserveControlChars.Contains(ch)
+                 where category is not (UnicodeCategory.Format or UnicodeCategory.NonSpacingMark)
+                 select ch)
+        {
+            filtered.Append(ch);
+        }

-        // Replace multiple newlines with a single newline
-        cleaned = MultiNewlineRegex().Replace(cleaned, "\n");
-
-        return cleaned;
-    }
-
-    private static string NormalizeFancyUnicode(string input)
-    {
-        var sb = new StringBuilder(input.Length);
-        foreach (var c in input.Normalize(NormalizationForm.FormC).Where(c =>
-                     char.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark))
-            sb.Append(c);
-
-        return sb.ToString();
+        return filtered.ToString();
     }
 }