using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; namespace Spine.Utils { /// /// UTF8 格式检测工具类 /// public static class Utf8Validator { /// /// 判断某段数据是否是 UTF8 格式, 会忽略尾部不完整数据 /// public static bool IsUtf8(byte[] data, int maxLength = 1024) { int length = Math.Min(data.Length, maxLength); int start = 0; if (length >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) { start = 3; } int expectedContinuationBytes = 0; for (int i = start; i < length; i++) { byte currentByte = data[i]; if (expectedContinuationBytes == 0) { if ((currentByte & 0x80) == 0x00) { // 0xxxxxxx,ASCII 字符 continue; } // 计算需要的续字节数 int needed; if ((currentByte & 0xE0) == 0xC0) { // 110xxxxx,1 个续字节 if (currentByte == 0xC0 || currentByte == 0xC1) return false; // 避免过长编码 needed = 1; } else if ((currentByte & 0xF0) == 0xE0) { // 1110xxxx,2 个续字节 needed = 2; } else if ((currentByte & 0xF8) == 0xF0) { // 11110xxx,3 个续字节 if (currentByte > 0xF4) return false; // 超出 Unicode 范围 needed = 3; } else { // 非法的起始字节 return false; } // 如果剩余字节不足以完成这个字符,就当作“尾部不完整”,跳出主循环 if (i + needed >= length) break; // 否则进入续字节检查 expectedContinuationBytes = needed; } else { // 检查续字节(10xxxxxx) if ((currentByte & 0xC0) != 0x80) return false; expectedContinuationBytes--; } } // 如果在跳出时,expectedContinuationBytes>0,说明我们跳过了一些尾部续字节, // 本着“忽略尾部不完整字符”的原则,仍然返回 true return expectedContinuationBytes == 0; } /// /// 判断某个文件是否是 UTF8 格式, 会忽略尾部不完整数据 /// public static bool IsUtf8(string path, int maxLength = 1024) { using var stream = File.OpenRead(path); byte[] data = new byte[maxLength]; var actualLength = stream.Read(data, 0, data.Length); return IsUtf8(data, actualLength); } } }