Files
SpineViewer/Spine/Utils/Utf8Validator.cs
2025-05-27 15:55:10 +08:00

100 lines
3.4 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Spine.Utils
{
/// <summary>
/// UTF8 格式检测工具类
/// </summary>
public static class Utf8Validator
{
/// <summary>
/// 判断某段数据是否是 UTF8 格式, 会忽略尾部不完整数据
/// </summary>
public static bool IsUtf8(byte[] data, int maxLength = 1024)
{
int length = Math.Min(data.Length, maxLength);
int start = 0;
if (length >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
{
start = 3;
}
int expectedContinuationBytes = 0;
for (int i = start; i < length; i++)
{
byte currentByte = data[i];
if (expectedContinuationBytes == 0)
{
if ((currentByte & 0x80) == 0x00)
{
// 0xxxxxxxASCII 字符
continue;
}
// 计算需要的续字节数
int needed;
if ((currentByte & 0xE0) == 0xC0)
{
// 110xxxxx1 个续字节
if (currentByte == 0xC0 || currentByte == 0xC1)
return false; // 避免过长编码
needed = 1;
}
else if ((currentByte & 0xF0) == 0xE0)
{
// 1110xxxx2 个续字节
needed = 2;
}
else if ((currentByte & 0xF8) == 0xF0)
{
// 11110xxx3 个续字节
if (currentByte > 0xF4)
return false; // 超出 Unicode 范围
needed = 3;
}
else
{
// 非法的起始字节
return false;
}
// 如果剩余字节不足以完成这个字符,就当作“尾部不完整”,跳出主循环
if (i + needed >= length)
break;
// 否则进入续字节检查
expectedContinuationBytes = needed;
}
else
{
// 检查续字节10xxxxxx
if ((currentByte & 0xC0) != 0x80)
return false;
expectedContinuationBytes--;
}
}
// 如果在跳出时expectedContinuationBytes>0说明我们跳过了一些尾部续字节
// 本着“忽略尾部不完整字符”的原则,仍然返回 true
return expectedContinuationBytes == 0;
}
/// <summary>
/// 判断某个文件是否是 UTF8 格式, 会忽略尾部不完整数据
/// </summary>
public static bool IsUtf8(string path, int maxLength = 1024)
{
using var stream = File.OpenRead(path);
byte[] data = new byte[maxLength];
var actualLength = stream.Read(data, 0, data.Length);
return IsUtf8(data, actualLength);
}
}
}