TxT2Docx/error_chars.py

323 lines
9.6 KiB
Python
Raw Normal View History

2025-09-21 19:01:40 +08:00
"""
错别字处理模块
负责错别字的加载管理和文本错误引入功能
支持自定义错别字库可按强度控制错误引入比例
"""
import os
import json
import random
import re
from typing import Dict, List, Tuple
class ErrorCharProcessor:
"""错别字处理器类"""
def __init__(self, db_path: str = "data/error_chars.json"):
"""
初始化错别字处理器
Args:
db_path: 错别字库文件路径
"""
self.db_path = db_path
self.error_chars = self.load_error_chars()
def load_error_chars(self) -> Dict[str, str]:
"""
加载错别字库
Returns:
Dict[str, str]: 错别字映射字典 {正确字: 错误字}
"""
# 检查文件夹是否存在,不存在则创建
dir_name = os.path.dirname(self.db_path)
if dir_name and not os.path.exists(dir_name):
os.makedirs(dir_name)
2025-09-21 20:40:36 +08:00
# print(f"加载错别字库文件: {self.db_path}")
2025-09-21 19:01:40 +08:00
# 检查文件是否存在,不存在则创建默认库
if not os.path.exists(self.db_path):
default_chars = self._get_default_error_chars()
self.save_error_chars(default_chars)
return default_chars
# 加载已存在的错别字库
try:
with open(self.db_path, 'r', encoding='utf-8') as f:
return json.load(f)
except Exception as e:
print(f"加载错别字库失败: {e}")
# 如果加载失败,返回默认库
return self._get_default_error_chars()
def save_error_chars(self, error_chars: Dict[str, str]) -> bool:
"""
保存错别字库到文件
Args:
error_chars: 错别字映射字典
Returns:
bool: 是否保存成功
"""
try:
# 确保目录存在
dir_name = os.path.dirname(self.db_path)
if dir_name and not os.path.exists(dir_name):
os.makedirs(dir_name)
with open(self.db_path, 'w', encoding='utf-8') as f:
json.dump(error_chars, f, ensure_ascii=False, indent=2)
return True
except Exception as e:
print(f"保存错别字库失败: {e}")
return False
def _get_default_error_chars(self) -> Dict[str, str]:
"""
获取默认错别字库
Returns:
Dict[str, str]: 默认错别字映射
"""
return {
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
}
def introduce_char_errors(self, text: str, intensity: float = 1.0) -> Tuple[str, int, List[str], List[str]]:
"""
将文本中的正确单字替换为常见错误单字
Args:
text: 要处理的文本
intensity: 错误引入强度0.0-1.0之间1.0表示替换所有可能的字
Returns:
Tuple[str, int, List[str], List[str]]:
处理后的文本替换的总数量原句列表处理后的句子列表
"""
if not text or intensity <= 0:
return text, 0, [], []
# 句子拆分
original_sentences = self._split_into_sentences(text)
modified_sentences = []
total_replace = 0
for sentence in original_sentences:
modified, count = self._introduce_errors_to_sentence(sentence, intensity)
modified_sentences.append(modified)
total_replace += count
modified_text = ''.join(modified_sentences)
return modified_text, total_replace, original_sentences, modified_sentences
def _split_into_sentences(self, text: str) -> List[str]:
"""
句子拆分函数
Args:
text: 要拆分的文本
Returns:
List[str]: 拆分后的句子列表
"""
separators = re.compile(r'([。!?;,.!?;])')
parts = separators.split(text)
sentences = []
for i in range(0, len(parts) - 1, 2):
if parts[i] or parts[i + 1]:
sentences.append(parts[i] + parts[i + 1])
if len(parts) % 2 == 1 and parts[-1]:
sentences.append(parts[-1])
return sentences
def _introduce_errors_to_sentence(self, sentence: str, intensity: float) -> Tuple[str, int]:
"""
单句错误引入函数
Args:
sentence: 要处理的句子
intensity: 错误引入强度
Returns:
Tuple[str, int]: 处理后的句子和替换数量
"""
modified = list(sentence)
replace_count = 0
for i, char in enumerate(modified):
if char in self.error_chars and random.random() <= intensity:
modified[i] = self.error_chars[char]
replace_count += 1
return ''.join(modified), replace_count
def add_error_mapping(self, correct_char: str, error_char: str) -> None:
"""
添加错别字映射
Args:
correct_char: 正确字符
error_char: 错误字符
"""
self.error_chars[correct_char] = error_char
def remove_error_mapping(self, correct_char: str) -> bool:
"""
删除错别字映射
Args:
correct_char: 要删除的正确字符
Returns:
bool: 是否删除成功
"""
if correct_char in self.error_chars:
del self.error_chars[correct_char]
return True
return False
def get_error_chars(self) -> Dict[str, str]:
"""
获取当前错别字映射
Returns:
Dict[str, str]: 错别字映射字典
"""
return self.error_chars.copy()
def update_error_chars(self, new_error_chars: Dict[str, str]) -> None:
"""
更新错别字映射
Args:
new_error_chars: 新的错别字映射
"""
self.error_chars.update(new_error_chars)
def clear_error_chars(self) -> None:
"""清空所有错别字映射"""
self.error_chars.clear()
def reset_to_default(self) -> None:
"""重置为默认错别字库"""
self.error_chars = self._get_default_error_chars()
def get_statistics(self, text: str, intensity: float = 1.0) -> Dict[str, int]:
"""
获取文本错误引入统计信息不实际修改文本
Args:
text: 要统计的文本
intensity: 错误引入强度
Returns:
Dict[str, int]: 统计信息
"""
if not text:
return {"total_chars": 0, "replaceable_chars": 0, "estimated_replacements": 0}
total_chars = len(text)
replaceable_chars = sum(1 for char in text if char in self.error_chars)
estimated_replacements = int(replaceable_chars * intensity)
return {
"total_chars": total_chars,
"replaceable_chars": replaceable_chars,
"estimated_replacements": estimated_replacements
}
def create_error_processor(db_path: str = "data/error_chars.json") -> ErrorCharProcessor:
    """Factory that builds an :class:`ErrorCharProcessor`.

    Args:
        db_path: Path of the JSON file that stores the typo table.

    Returns:
        ErrorCharProcessor: A processor constructed with ``db_path``.
    """
    processor = ErrorCharProcessor(db_path)
    return processor
# Function-style wrappers kept for compatibility with the legacy interface.
def load_error_chars(db_path: str = "data/error_chars.json") -> Dict[str, str]:
    """Load the typo table (legacy function-style interface).

    Builds a throwaway :class:`ErrorCharProcessor` for ``db_path`` and returns
    the mapping obtained from its ``get_error_chars()``.

    Args:
        db_path: Path of the JSON file that stores the typo table.

    Returns:
        Dict[str, str]: Mapping of correct character to typo character.
    """
    return ErrorCharProcessor(db_path).get_error_chars()
def introduce_char_errors(text: str, intensity: float = 1.0, db_path: str = "data/error_chars.json") -> Tuple[str, int, List[str], List[str]]:
    """Inject typos into ``text`` (legacy function-style interface).

    Builds a throwaway :class:`ErrorCharProcessor` for ``db_path`` and
    delegates to its ``introduce_char_errors`` method.

    Args:
        text: Text to process.
        intensity: Error-injection intensity, between 0.0 and 1.0.
        db_path: Path of the JSON file that stores the typo table.

    Returns:
        Tuple[str, int, List[str], List[str]]: The processed text, the total
        number of replacements, the original sentences, and the processed
        sentences.
    """
    return ErrorCharProcessor(db_path).introduce_char_errors(text, intensity)