import hashlib
import re
from typing import List, Optional


def get_file_hash(file_path: str) -> str:
    """Calculate the MD5 hash of a file, reading it in 4 KB chunks."""
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def detect_encoding(file_path: str) -> Optional[str]:
    """Detect a file's encoding; fall back to UTF-8 if chardet is unavailable."""
    try:
        import chardet
    except ImportError:
        # Fallback when chardet is not installed
        return "utf-8"

    with open(file_path, "rb") as f:
        raw_data = f.read()
    result = chardet.detect(raw_data)
    encoding = result["encoding"]

    # Priority: UTF-8 > GBK > whatever chardet reports (may be None)
    if encoding and encoding.lower().startswith("utf-8"):
        return "utf-8"
    if encoding and encoding.lower() == "gbk":
        return "gbk"
    return encoding


def read_text_file(file_path: str) -> str:
    """Read a text file with automatic encoding detection."""
    encoding = detect_encoding(file_path)
    if not encoding:
        raise ValueError("Unable to detect the file encoding")
    try:
        with open(file_path, "r", encoding=encoding) as f:
            return f.read()
    except UnicodeDecodeError:
        # Fall back to UTF-8, silently dropping undecodable bytes
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except Exception as e:
        raise ValueError(f"Failed to read file: {e}")


def split_into_paragraphs(text: str) -> List[str]:
    """Split text into paragraphs: by headings, then blank lines, then sentences."""
    # First, try to split on headings: lines starting with a Chinese numeral
    # or digit followed by an enumeration comma (、), a period, or a space.
    # Splitting on a lookahead keeps each heading attached to the section
    # it introduces, instead of separating titles from their bodies.
    heading_start = r'(?:^|\n)(?=[一二三四五六七八九十\d]+[.、 ])'
    if re.search(r'(?:^|\n)[一二三四五六七八九十\d]+[.、 ]', text):
        sections = [s.strip() for s in re.split(heading_start, text) if s.strip()]
        if sections:
            return sections

    # Next, try splitting on blank lines (paragraph breaks)
    paragraphs = text.split("\n\n")

    # If that yields too few paragraphs, fall back to sentence splitting
    if len(paragraphs) <= 2:
        # Split on sentence-ending punctuation; the capturing group keeps
        # each delimiter so it can be re-attached to its sentence.
        parts = re.split(r'([。!?.!?;;])', text)
        sentences = [parts[i] + parts[i + 1] for i in range(0, len(parts) - 1, 2)]
        if len(parts) % 2 == 1 and parts[-1].strip():
            sentences.append(parts[-1])

        # Merge short sentences into chunks of a readable length
        combined_sentences = []
        current = ""
        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            current += sentence
            # Flush once the chunk is long enough or ends a full sentence
            if len(current) > 50 or re.search(r'[。!?.!?]$', sentence):
                combined_sentences.append(current)
                current = ""
        # Append any remainder
        if current:
            combined_sentences.append(current)
        return combined_sentences

    # Filter out empty paragraphs
    return [p.strip() for p in paragraphs if p.strip()]


def clean_text(text: str) -> str:
    """Clean text by removing headers/footers, garbled lines, and excessive repeats."""
    lines = text.split("\n")
    cleaned_lines = []

    # Header/footer patterns to drop: page markers ("第N页") and blank lines
    skip_patterns = [r"第\d+页", r"^\s*$"]

    for line in lines:
        # Skip header/footer lines
        if any(re.search(pattern, line) for pattern in skip_patterns):
            continue
        # Skip garbled (mojibake) lines, e.g. a UTF-8 BOM decoded as GBK ("锘")
        if "锘" in line or "???" in line:
            continue
        cleaned_lines.append(line)

    # Collapse runs of identical lines: keep the first occurrence
    # plus at most two consecutive duplicates
    final_lines = []
    prev_line = ""
    repeat_count = 0
    for line in cleaned_lines:
        if line == prev_line:
            repeat_count += 1
            if repeat_count < 3:
                final_lines.append(line)
        else:
            repeat_count = 0
            final_lines.append(line)
        prev_line = line

    return "\n".join(final_lines)
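

# --- Usage sketch (illustrative) ---
# A minimal example of chaining the helpers above into a small text
# preprocessing pipeline. The path "sample.txt" is a hypothetical input
# assumed for illustration; it is not part of this module.
if __name__ == "__main__":
    path = "sample.txt"  # hypothetical input file, assumed to exist

    # Fingerprint the input so unchanged files could be skipped on re-runs
    print("MD5:", get_file_hash(path))

    # Read with encoding detection, strip noise, then segment into paragraphs
    raw = read_text_file(path)
    cleaned = clean_text(raw)
    for i, paragraph in enumerate(split_into_paragraphs(cleaned), start=1):
        print(f"[{i}] {paragraph[:60]}")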