TxT2Docx/error_chars.py
2025-09-21 19:01:40 +08:00

323 lines
9.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
错别字处理模块
负责错别字的加载、管理和文本错误引入功能。
支持自定义错别字库,可按强度控制错误引入比例。
"""
import os
import json
import random
import re
from typing import Dict, List, Tuple
class ErrorCharProcessor:
"""错别字处理器类"""
def __init__(self, db_path: str = "data/error_chars.json"):
"""
初始化错别字处理器
Args:
db_path: 错别字库文件路径
"""
self.db_path = db_path
self.error_chars = self.load_error_chars()
def load_error_chars(self) -> Dict[str, str]:
"""
加载错别字库
Returns:
Dict[str, str]: 错别字映射字典 {正确字: 错误字}
"""
# 检查文件夹是否存在,不存在则创建
dir_name = os.path.dirname(self.db_path)
if dir_name and not os.path.exists(dir_name):
os.makedirs(dir_name)
print(f"加载错别字库文件: {self.db_path}")
# 检查文件是否存在,不存在则创建默认库
if not os.path.exists(self.db_path):
default_chars = self._get_default_error_chars()
self.save_error_chars(default_chars)
return default_chars
# 加载已存在的错别字库
try:
with open(self.db_path, 'r', encoding='utf-8') as f:
return json.load(f)
except Exception as e:
print(f"加载错别字库失败: {e}")
# 如果加载失败,返回默认库
return self._get_default_error_chars()
def save_error_chars(self, error_chars: Dict[str, str]) -> bool:
"""
保存错别字库到文件
Args:
error_chars: 错别字映射字典
Returns:
bool: 是否保存成功
"""
try:
# 确保目录存在
dir_name = os.path.dirname(self.db_path)
if dir_name and not os.path.exists(dir_name):
os.makedirs(dir_name)
with open(self.db_path, 'w', encoding='utf-8') as f:
json.dump(error_chars, f, ensure_ascii=False, indent=2)
return True
except Exception as e:
print(f"保存错别字库失败: {e}")
return False
def _get_default_error_chars(self) -> Dict[str, str]:
"""
获取默认错别字库
Returns:
Dict[str, str]: 默认错别字映射
"""
return {
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
}
def introduce_char_errors(self, text: str, intensity: float = 1.0) -> Tuple[str, int, List[str], List[str]]:
"""
将文本中的正确单字替换为常见错误单字
Args:
text: 要处理的文本
intensity: 错误引入强度0.0-1.0之间1.0表示替换所有可能的字
Returns:
Tuple[str, int, List[str], List[str]]:
处理后的文本、替换的总数量、原句列表、处理后的句子列表
"""
if not text or intensity <= 0:
return text, 0, [], []
# 句子拆分
original_sentences = self._split_into_sentences(text)
modified_sentences = []
total_replace = 0
for sentence in original_sentences:
modified, count = self._introduce_errors_to_sentence(sentence, intensity)
modified_sentences.append(modified)
total_replace += count
modified_text = ''.join(modified_sentences)
return modified_text, total_replace, original_sentences, modified_sentences
def _split_into_sentences(self, text: str) -> List[str]:
"""
句子拆分函数
Args:
text: 要拆分的文本
Returns:
List[str]: 拆分后的句子列表
"""
separators = re.compile(r'([。!?;,.!?;])')
parts = separators.split(text)
sentences = []
for i in range(0, len(parts) - 1, 2):
if parts[i] or parts[i + 1]:
sentences.append(parts[i] + parts[i + 1])
if len(parts) % 2 == 1 and parts[-1]:
sentences.append(parts[-1])
return sentences
def _introduce_errors_to_sentence(self, sentence: str, intensity: float) -> Tuple[str, int]:
"""
单句错误引入函数
Args:
sentence: 要处理的句子
intensity: 错误引入强度
Returns:
Tuple[str, int]: 处理后的句子和替换数量
"""
modified = list(sentence)
replace_count = 0
for i, char in enumerate(modified):
if char in self.error_chars and random.random() <= intensity:
modified[i] = self.error_chars[char]
replace_count += 1
return ''.join(modified), replace_count
def add_error_mapping(self, correct_char: str, error_char: str) -> None:
"""
添加错别字映射
Args:
correct_char: 正确字符
error_char: 错误字符
"""
self.error_chars[correct_char] = error_char
def remove_error_mapping(self, correct_char: str) -> bool:
"""
删除错别字映射
Args:
correct_char: 要删除的正确字符
Returns:
bool: 是否删除成功
"""
if correct_char in self.error_chars:
del self.error_chars[correct_char]
return True
return False
def get_error_chars(self) -> Dict[str, str]:
"""
获取当前错别字映射
Returns:
Dict[str, str]: 错别字映射字典
"""
return self.error_chars.copy()
def update_error_chars(self, new_error_chars: Dict[str, str]) -> None:
"""
更新错别字映射
Args:
new_error_chars: 新的错别字映射
"""
self.error_chars.update(new_error_chars)
def clear_error_chars(self) -> None:
"""清空所有错别字映射"""
self.error_chars.clear()
def reset_to_default(self) -> None:
"""重置为默认错别字库"""
self.error_chars = self._get_default_error_chars()
def get_statistics(self, text: str, intensity: float = 1.0) -> Dict[str, int]:
"""
获取文本错误引入统计信息(不实际修改文本)
Args:
text: 要统计的文本
intensity: 错误引入强度
Returns:
Dict[str, int]: 统计信息
"""
if not text:
return {"total_chars": 0, "replaceable_chars": 0, "estimated_replacements": 0}
total_chars = len(text)
replaceable_chars = sum(1 for char in text if char in self.error_chars)
estimated_replacements = int(replaceable_chars * intensity)
return {
"total_chars": total_chars,
"replaceable_chars": replaceable_chars,
"estimated_replacements": estimated_replacements
}
def create_error_processor(db_path: str = "data/error_chars.json") -> ErrorCharProcessor:
    """
    Factory function that builds an ErrorCharProcessor.

    Args:
        db_path: Path of the typo-table file handed to the processor.

    Returns:
        ErrorCharProcessor: A newly constructed processor for ``db_path``.
    """
    processor = ErrorCharProcessor(db_path)
    return processor
# 兼容旧接口的函数
def load_error_chars(db_path: str = "data/error_chars.json") -> Dict[str, str]:
    """
    Load a typo table (wrapper kept for the legacy functional interface).

    Args:
        db_path: Path of the typo-table file.

    Returns:
        Dict[str, str]: The mapping returned by ``get_error_chars()`` on a
        processor built for ``db_path``.
    """
    return ErrorCharProcessor(db_path).get_error_chars()
def introduce_char_errors(text: str, intensity: float = 1.0, db_path: str = "data/error_chars.json") -> Tuple[str, int, List[str], List[str]]:
    """
    Inject typos into ``text`` (wrapper kept for the legacy functional interface).

    A new ErrorCharProcessor is built for ``db_path`` on every call and the
    work is delegated to its ``introduce_char_errors`` method.

    Args:
        text: Text to process.
        intensity: Error-injection intensity, from 0.0 to 1.0.
        db_path: Path of the typo-table file.

    Returns:
        Tuple[str, int, List[str], List[str]]: The processed text, the total
        number of replacements, the original sentences and the processed
        sentences.
    """
    result = ErrorCharProcessor(db_path).introduce_char_errors(text, intensity)
    return result