323 lines
9.6 KiB
Python
323 lines
9.6 KiB
Python
"""
|
||
错别字处理模块
|
||
|
||
负责错别字的加载、管理和文本错误引入功能。
|
||
支持自定义错别字库,可按强度控制错误引入比例。
|
||
"""
|
||
|
||
import os
|
||
import json
|
||
import random
|
||
import re
|
||
from typing import Dict, List, Tuple
|
||
|
||
|
||
class ErrorCharProcessor:
    """Load, manage and apply a table of wrong-character substitutions.

    The table maps a correct character to the wrong character that replaces
    it (``{correct_char: error_char}``). It is loaded from a JSON file at
    construction time; a built-in default table is used (and written to
    disk) when the file does not exist.
    """

    # Clause/sentence delimiters. The ideographic full stop and the ASCII
    # marks are joined by the full-width forms of "!?;," so that Chinese
    # punctuation actually splits the text.
    _SENTENCE_SEPARATORS = re.compile(
        "([\u3002\uff01\uff1f\uff1b\uff0c.!?;,])"
    )

    def __init__(self, db_path: str = "data/error_chars.json"):
        """
        Initialise the processor and load the wrong-character table.

        Args:
            db_path: Path of the JSON file holding the table.
        """
        self.db_path = db_path
        self.error_chars = self.load_error_chars()

    def load_error_chars(self) -> Dict[str, str]:
        """
        Load the wrong-character table from ``self.db_path``.

        When the file does not exist, the default table is written to it
        (best effort) and returned. When the file cannot be read, is not
        valid JSON, or does not contain a JSON object, the failure is
        printed and the default table is returned.

        Returns:
            Dict[str, str]: Mapping ``{correct_char: error_char}``.
        """
        print(f"加载错别字库文件: {self.db_path}")

        # No file yet: seed it with the defaults. save_error_chars creates
        # the parent directory and reports (rather than raises) failures.
        if not os.path.exists(self.db_path):
            default_chars = self._get_default_error_chars()
            self.save_error_chars(default_chars)
            return default_chars

        try:
            with open(self.db_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if not isinstance(data, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(data).__name__}"
                )
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"加载错别字库失败: {e}")
            return self._get_default_error_chars()

        # Non-string values would break ''.join() during replacement, so
        # they are dropped here instead of failing later.
        return {
            correct: wrong
            for correct, wrong in data.items()
            if isinstance(wrong, str)
        }

    def save_error_chars(self, error_chars: Dict[str, str]) -> bool:
        """
        Write a wrong-character table to ``self.db_path`` as JSON.

        Args:
            error_chars: Mapping ``{correct_char: error_char}`` to store.

        Returns:
            bool: True on success, False if serialising or writing failed
            (the failure is printed).
        """
        try:
            # Serialise before opening the file so that a serialisation
            # error cannot leave a truncated file behind.
            payload = json.dumps(error_chars, ensure_ascii=False, indent=2)

            dir_name = os.path.dirname(self.db_path)
            if dir_name:
                os.makedirs(dir_name, exist_ok=True)

            with open(self.db_path, 'w', encoding='utf-8') as f:
                f.write(payload)
            return True
        except (OSError, TypeError, ValueError) as e:
            print(f"保存错别字库失败: {e}")
            return False

    def _get_default_error_chars(self) -> Dict[str, str]:
        """
        Build the built-in default wrong-character table.

        Returns:
            Dict[str, str]: A new dict ``{correct_char: error_char}``.
        """
        return {
            "的": "地",
            "地": "得",
            "得": "的",
            "在": "再",
            "再": "在",
            "是": "事",
            "事": "是",
            "他": "她",
            "她": "他",
            "你": "您",
            "您": "你",
            "们": "门",
            "门": "们",
            "有": "又",
            "又": "有",
            "和": "合",
            "合": "和",
            "到": "倒",
            "倒": "到",
            "就": "才",
            "才": "就",
            "要": "耍",
            "耍": "要",
            "会": "汇",
            "汇": "会",
            "看": "着",
            "着": "看",
            "说": "讲",
            "讲": "说",
            "做": "作",
            "作": "做",
            # This key used to be listed twice; a dict keeps only the last
            # value, so the effective mapping is kept and the dead entry
            # ("已" -> "己") is gone.
            "已": "以",
            "己": "已",
            "以": "已",
            "进": "近",
            "近": "进",
            "象": "像",
            "像": "象",
            "对": "队",
            "队": "对",
            "分": "份",
            "份": "分",
        }

    def introduce_char_errors(self, text: str, intensity: float = 1.0) -> Tuple[str, int, List[str], List[str]]:
        """
        Replace correct characters in ``text`` with their mapped wrong ones.

        Args:
            text: Text to process.
            intensity: Probability (0.0-1.0) that each replaceable character
                is replaced; 1.0 replaces every replaceable character.

        Returns:
            Tuple[str, int, List[str], List[str]]: The processed text, the
            total number of replacements, the original sentences and the
            processed sentences. For empty text or ``intensity <= 0`` the
            text is returned unchanged with a count of 0 and two empty
            lists.
        """
        if not text or intensity <= 0:
            return text, 0, [], []

        original_sentences = self._split_into_sentences(text)
        modified_sentences = []
        total_replace = 0

        for sentence in original_sentences:
            modified, count = self._introduce_errors_to_sentence(sentence, intensity)
            modified_sentences.append(modified)
            total_replace += count

        return ''.join(modified_sentences), total_replace, original_sentences, modified_sentences

    def _split_into_sentences(self, text: str) -> List[str]:
        """
        Split ``text`` into sentences, keeping each delimiter attached.

        Args:
            text: Text to split.

        Returns:
            List[str]: Sentences in order; joining them reproduces ``text``.
        """
        # With one capturing group, split() yields
        # [body, sep, body, sep, ..., tail]: always an odd number of items.
        parts = self._SENTENCE_SEPARATORS.split(text)
        sentences = [body + sep for body, sep in zip(parts[0::2], parts[1::2])]

        # Trailing text that is not terminated by a delimiter.
        if parts[-1]:
            sentences.append(parts[-1])

        return sentences

    def _introduce_errors_to_sentence(self, sentence: str, intensity: float) -> Tuple[str, int]:
        """
        Introduce character errors into a single sentence.

        Args:
            sentence: Sentence to process.
            intensity: Probability that each replaceable character is
                replaced.

        Returns:
            Tuple[str, int]: The processed sentence and the number of
            replacements made.
        """
        mapping = self.error_chars
        pieces = []
        replace_count = 0

        for char in sentence:
            # random() is in [0.0, 1.0): a strict "<" always replaces at
            # intensity 1.0 and never replaces at intensity 0.0.
            if char in mapping and random.random() < intensity:
                pieces.append(mapping[char])
                replace_count += 1
            else:
                pieces.append(char)

        return ''.join(pieces), replace_count

    def add_error_mapping(self, correct_char: str, error_char: str) -> None:
        """
        Add (or overwrite) one mapping in the in-memory table.

        Args:
            correct_char: The correct character.
            error_char: The wrong character that replaces it.
        """
        self.error_chars[correct_char] = error_char

    def remove_error_mapping(self, correct_char: str) -> bool:
        """
        Remove one mapping from the in-memory table.

        Args:
            correct_char: The correct character whose mapping is removed.

        Returns:
            bool: True if a mapping existed and was removed, else False.
        """
        if correct_char in self.error_chars:
            del self.error_chars[correct_char]
            return True
        return False

    def get_error_chars(self) -> Dict[str, str]:
        """
        Return the current wrong-character table.

        Returns:
            Dict[str, str]: A shallow copy of the in-memory table.
        """
        return self.error_chars.copy()

    def update_error_chars(self, new_error_chars: Dict[str, str]) -> None:
        """
        Merge mappings into the in-memory table.

        Args:
            new_error_chars: Mappings to add; existing keys are overwritten.
        """
        self.error_chars.update(new_error_chars)

    def clear_error_chars(self) -> None:
        """Remove every mapping from the in-memory table."""
        self.error_chars.clear()

    def reset_to_default(self) -> None:
        """Replace the in-memory table with the built-in default table."""
        self.error_chars = self._get_default_error_chars()

    def get_statistics(self, text: str, intensity: float = 1.0) -> Dict[str, int]:
        """
        Estimate how many replacements would be made, without editing text.

        Args:
            text: Text to analyse.
            intensity: Replacement probability; values outside 0.0-1.0 are
                clamped so the estimate never goes below zero or above the
                number of replaceable characters.

        Returns:
            Dict[str, int]: ``total_chars``, ``replaceable_chars`` and
            ``estimated_replacements``.
        """
        if not text:
            return {"total_chars": 0, "replaceable_chars": 0, "estimated_replacements": 0}

        replaceable_chars = sum(1 for char in text if char in self.error_chars)
        probability = max(0.0, min(1.0, intensity))

        return {
            "total_chars": len(text),
            "replaceable_chars": replaceable_chars,
            "estimated_replacements": int(replaceable_chars * probability),
        }
|
||
|
||
|
||
def create_error_processor(db_path: str = "data/error_chars.json") -> ErrorCharProcessor:
    """
    Factory function that builds a wrong-character processor.

    Args:
        db_path: Path of the wrong-character table file.

    Returns:
        ErrorCharProcessor: A processor constructed with ``db_path``.
    """
    processor = ErrorCharProcessor(db_path)
    return processor
|
||
|
||
|
||
# Functions kept for compatibility with the old interface
|
||
def load_error_chars(db_path: str = "data/error_chars.json") -> Dict[str, str]:
    """
    Load the wrong-character table (old-interface compatibility wrapper).

    Builds a fresh ErrorCharProcessor for ``db_path`` on every call and
    returns the table obtained from its ``get_error_chars()``.

    Args:
        db_path: Path of the wrong-character table file.

    Returns:
        Dict[str, str]: Mapping of wrong-character substitutions.
    """
    return ErrorCharProcessor(db_path).get_error_chars()
|
||
|
||
|
||
def introduce_char_errors(text: str, intensity: float = 1.0, db_path: str = "data/error_chars.json") -> Tuple[str, int, List[str], List[str]]:
    """
    Replace correct characters with common wrong ones (old-interface wrapper).

    Builds a fresh ErrorCharProcessor for ``db_path`` on every call and
    delegates to its ``introduce_char_errors`` method.

    Args:
        text: Text to process.
        intensity: Error-introduction intensity, between 0.0 and 1.0.
        db_path: Path of the wrong-character table file.

    Returns:
        Tuple[str, int, List[str], List[str]]: The processed text, the total
        number of replacements, the original sentences and the processed
        sentences.
    """
    return ErrorCharProcessor(db_path).introduce_char_errors(text, intensity)