# TxT2Docx/text_processor.py

"""
文本处理模块

负责文本的各种处理功能，包括顺序调换、标点符号替换、错别字处理等。
"""

from typing import Optional
from config import config
from error_chars import ErrorCharProcessor
from replacestr import replace_text


class TextProcessor:
    """文本处理器类，统一处理各种文本操作"""
    
    def __init__(self) -> None:
        """Create the processor and attempt to set up the typo injector.

        ``error_processor`` starts out as ``None``; ``_init_error_processor``
        replaces it only when ``config.enable_char_errors`` is truthy.
        """
        self.error_processor: Optional[ErrorCharProcessor] = None
        self._init_error_processor()

    def _init_error_processor(self) -> None:
        """Build the typo injector when the feature is switched on.

        Leaves ``self.error_processor`` untouched when
        ``config.enable_char_errors`` is falsy.
        """
        if not config.enable_char_errors:
            return
        self.error_processor = ErrorCharProcessor(config.char_error_db_path)

    def replace_periods(self, text: str) -> str:
        """
        Turn Chinese full stops into commas and drop a terminal full stop.

        Trailing whitespace is stripped first, so a full stop followed only
        by whitespace still counts as the terminal one. Only a single
        terminal full stop is removed; any remaining '。' becomes '，'.

        Args:
            text: Input text.

        Returns:
            str: The rewritten text, or '' when the input is empty or
            consists only of whitespace.
        """
        trimmed = text.rstrip() if text else ''
        if not trimmed:
            return ''

        # Drop exactly one terminal full stop, if present.
        if trimmed.endswith('。'):
            trimmed = trimmed[:-1]

        # Every full stop that is left sits mid-text: soften it to a comma.
        return trimmed.replace('。', '，')

    def reverse_text_order(self, content: str) -> str:
        """
        Reverse the text character by character.

        Args:
            content: Input text.

        Returns:
            str: The characters of ``content`` in reverse order; falsy input
            is returned as-is.
        """
        return content[::-1] if content else content

    def reverse_paragraph_order(self, content: str) -> str:
        """
        Reverse the order of paragraphs, keeping each paragraph's text intact.

        A paragraph here is whatever sits between two '\\n' characters, so
        empty lines count as (empty) paragraphs and are reordered too.

        Args:
            content: Input text.

        Returns:
            str: The text with its '\\n'-separated pieces in reverse order;
            falsy input is returned as-is.
        """
        if not content:
            return content
        return '\n'.join(content.split('\n')[::-1])

    def apply_char_errors(self, text: str) -> str:
        """
        Inject typos into the text when the feature is enabled.

        This is best-effort: the input is returned unchanged when the
        feature is disabled, the text is empty, no processor is available,
        or the processor raises.

        Args:
            text: Input text.

        Returns:
            str: The text with typos applied, or the original text.
        """
        if not (config.enable_char_errors and text):
            return text

        try:
            # The processor may still be missing (e.g. the feature was off
            # when this object was constructed), so try to build it now.
            if not self.error_processor:
                self._init_error_processor()

            if not self.error_processor:
                return text

            # introduce_char_errors yields four values; only the rewritten
            # text and the replacement count are used here.
            altered, hits, _, _ = self.error_processor.introduce_char_errors(
                text, config.char_error_intensity
            )
            if hits > 0:
                print(f"已应用错别字处理，替换了 {hits} 个字符。")
            return altered
        except Exception as exc:
            # Any failure falls back to the untouched input.
            print(f"错别字处理出错: {exc}")
            return text

    def apply_text_order_processing(self, text: str) -> str:
        """
        Apply the text-order transformation when it is enabled.

        Delegates to ``replace_text``; if that call raises, the error is
        printed and the original text is returned.

        Args:
            text: Input text.

        Returns:
            str: The transformed text, or the input unchanged when
            ``config.reverse_text_order`` is falsy, the text is empty, or
            the transformation fails.
        """
        if not (config.reverse_text_order and text):
            return text

        try:
            reordered = replace_text(text)
        except Exception as exc:
            print(f"文字顺序处理出错: {exc}")
            return text
        return reordered

    def process_text_content(self, text: str) -> str:
        """
        Run the full text pipeline: reordering, typo injection, punctuation.

        The steps run in that fixed order. Empty or whitespace-only input
        skips the pipeline and is returned untouched.

        Args:
            text: Input text.

        Returns:
            str: The processed text.
        """
        if not (text and text.strip()):
            return text

        # Step 1: text-order processing; step 2: typo injection.
        result = self.apply_char_errors(self.apply_text_order_processing(text))

        # Step 3 (optional): punctuation replacement runs last.
        return self.replace_periods(result) if config.replace_punctuation else result

    def clean_text(self, text: str) -> str:
        """
        Normalize line endings and strip trailing whitespace from each line.

        Leading whitespace and blank lines are preserved.

        Args:
            text: Input text.

        Returns:
            str: The cleaned text; falsy input is returned as-is.
        """
        if not text:
            return text

        # Collapse Windows (\r\n) first, then any lone \r, into '\n'.
        unified = text.replace("\r\n", "\n").replace("\r", "\n")

        return '\n'.join(row.rstrip() for row in unified.split('\n'))

    def normalize_text(self, text: str) -> str:
        """
        Normalize text formatting.

        Runs ``clean_text`` and then applies a punctuation translation table.

        Args:
            text: Input text.

        Returns:
            str: The normalized text; falsy input is returned as-is.
        """
        if not text:
            return text

        # NOTE(review): as written, every entry appears to map a full-width
        # mark to itself, so this table changes nothing. Presumably the keys
        # were meant to be other variants of these marks; confirm the
        # intended mapping before relying on this step.
        punctuation_table = str.maketrans({
            '，': '，',  # full-width comma
            '。': '。',  # full-width full stop
            '！': '！',  # full-width exclamation mark
            '？': '？',  # full-width question mark
            '；': '；',  # full-width semicolon
            '：': '：',  # full-width colon
        })

        return self.clean_text(text).translate(punctuation_table)

    def get_processing_statistics(self, text: str) -> dict:
        """
        Collect basic statistics about the text.

        The result always contains ``total_chars``, ``total_lines``,
        ``non_empty_lines``, ``error_chars_enabled`` and
        ``estimated_error_replacements``. When typo injection is enabled and
        the processor's statistics call succeeds, ``replaceable_chars`` is
        added as well.

        Args:
            text: Input text.

        Returns:
            dict: The statistics; all counters are 0 for empty input.
        """
        stats = {
            "total_chars": 0,
            "total_lines": 0,
            "non_empty_lines": 0,
            "error_chars_enabled": config.enable_char_errors,
            "estimated_error_replacements": 0,
        }
        if not text:
            return stats

        rows = text.split('\n')
        stats["total_chars"] = len(text)
        stats["total_lines"] = len(rows)
        stats["non_empty_lines"] = sum(1 for row in rows if row.strip())

        if not stats["error_chars_enabled"]:
            return stats

        # Best-effort: a failure here only costs the typo estimates.
        try:
            if not self.error_processor:
                self._init_error_processor()

            if self.error_processor:
                error_stats = self.error_processor.get_statistics(
                    text, config.char_error_intensity
                )
                stats["estimated_error_replacements"] = error_stats["estimated_replacements"]
                stats["replaceable_chars"] = error_stats["replaceable_chars"]
        except Exception as exc:
            print(f"获取错别字统计失败: {exc}")

        return stats

    def preview_processing(self, text: str, max_length: int = 200) -> dict:
        """
        Preview the pipeline's effect on the beginning of the text.

        Only the first ``max_length`` characters are processed when the
        text is longer than that.

        Args:
            text: Input text.
            max_length: Maximum number of characters to include in the preview.

        Returns:
            dict: ``original`` (the preview slice), ``processed`` (that slice
            after ``process_text_content``) and ``truncated`` (whether the
            text was longer than ``max_length``).
        """
        if not text:
            return {"original": "", "processed": "", "truncated": False}

        truncated = len(text) > max_length
        snippet = text[:max_length] if truncated else text

        return {
            "original": snippet,
            "processed": self.process_text_content(snippet),
            "truncated": truncated,
        }


# Module-level shared TextProcessor instance, constructed when this module
# is imported; the legacy wrapper functions below delegate to it.
text_processor = TextProcessor()


# 兼容旧接口的函数
def process_text_content(text: str) -> str:
    """
    Run the full text pipeline (legacy module-level entry point).

    Delegates to ``text_processor.process_text_content``.

    Args:
        text: Input text.

    Returns:
        str: The processed text.
    """
    processed = text_processor.process_text_content(text)
    return processed


def replace_periods(text: str) -> str:
    """
    Replace full stops with commas (legacy module-level entry point).

    Delegates to ``text_processor.replace_periods``.

    Args:
        text: Input text.

    Returns:
        str: The processed text.
    """
    replaced = text_processor.replace_periods(text)
    return replaced


def reverse_text_order(content: str) -> str:
    """
    Reverse the text character by character (legacy module-level entry point).

    Delegates to ``text_processor.reverse_text_order``.

    Args:
        content: Input text.

    Returns:
        str: The reversed text.
    """
    flipped = text_processor.reverse_text_order(content)
    return flipped


def apply_char_errors(text: str) -> str:
    """
    Apply typo injection (legacy module-level entry point).

    Delegates to ``text_processor.apply_char_errors``.

    Args:
        text: Input text.

    Returns:
        str: The processed text.
    """
    altered = text_processor.apply_char_errors(text)
    return altered