diff --git a/batch_processor.py b/batch_processor.py index 6b38f89..1e0615d 100644 --- a/batch_processor.py +++ b/batch_processor.py @@ -129,11 +129,18 @@ class BatchProcessor: # 生成DOCX def update_file_progress(progress: int, text: str): if progress_callback: + # 优化进度计算,避免浮点数精度问题 + # 确保当前文件进度在0-100范围内 + file_progress = max(0, min(100, progress)) + # 计算整体进度:当前文件的进度在总进度中的占比 - file_weight = 1.0 / total_count - current_file_progress = current_index + (progress / 100.0) - overall_progress = int((current_file_progress / total_count) * 100) - progress_callback(overall_progress, f"{pair['txt']['name']}: {text}") + if total_count > 0: + # 使用整数运算避免浮点数精度问题 + overall_progress = (current_index * 100 + file_progress) // total_count + overall_progress = max(0, min(100, overall_progress)) + progress_callback(int(overall_progress), f"{pair['txt']['name']}: {text}") + else: + progress_callback(file_progress, f"{pair['txt']['name']}: {text}") success = self.docx_generator.generate(sections, image_files, output_path, update_file_progress) @@ -148,7 +155,7 @@ class BatchProcessor: return result def validate_batch_input(self, txt_folder: str, images_root: str, - output_root: str = None) -> Dict[str, Any]: + output_root: Optional[str] = None) -> Dict[str, Any]: """ 验证批量处理的输入参数 diff --git a/config.json b/config.json deleted file mode 100644 index 97f47ad..0000000 --- a/config.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "text_order_conversion": false, - "typo_handling": false, - "punctuation_replacement": false, - "paragraph_formatting": false, - "paragraph_min_length": 100, - "paragraph_max_length": 300, - "typo_intensity": 0.5, - "custom_punctuation": ",。!?;?!;", - "output_path": "", - "use_same_folder": true, - "last_txt_folder": "", - "last_images_root": "", - "last_output_root": "" -} \ No newline at end of file diff --git a/docx_generator.py b/docx_generator.py index 24f4c15..e8673b2 100644 --- a/docx_generator.py +++ b/docx_generator.py @@ -226,49 +226,19 @@ class DocxGenerator: para = doc.add_paragraph(style='List Bullet') self._apply_inline_formatting(para, content) # 应用列表样式 - if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.unordered_list: - list_style = self.current_document_style.unordered_list - if list_style.paragraph: - if list_style.paragraph.space_before > 0: - para.paragraph_format.space_before = Pt(list_style.paragraph.space_before) - if list_style.paragraph.space_after > 0: - para.paragraph_format.space_after = Pt(list_style.paragraph.space_after) + self._apply_list_style(para, 'unordered') elif element_type == 'ordered_list': para = doc.add_paragraph(style='List Number') self._apply_inline_formatting(para, content) # 应用列表样式 - if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.ordered_list: - list_style = self.current_document_style.ordered_list - if list_style.paragraph: - if list_style.paragraph.space_before > 0: - para.paragraph_format.space_before = Pt(list_style.paragraph.space_before) - if list_style.paragraph.space_after > 0: - para.paragraph_format.space_after = Pt(list_style.paragraph.space_after) + self._apply_list_style(para, 'ordered') elif element_type == 'blockquote': para = doc.add_paragraph(style='Quote') self._apply_inline_formatting(para, content) # 应用引用样式 - if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.quote_block: - quote_style = self.current_document_style.quote_block - if quote_style.paragraph: - if 
quote_style.paragraph.line_spacing > 0: - para.paragraph_format.line_spacing = quote_style.paragraph.line_spacing - if quote_style.paragraph.space_before > 0: - para.paragraph_format.space_before = Pt(quote_style.paragraph.space_before) - if quote_style.paragraph.space_after > 0: - para.paragraph_format.space_after = Pt(quote_style.paragraph.space_after) - if quote_style.paragraph.first_line_indent > 0: - para.paragraph_format.first_line_indent = Pt(quote_style.paragraph.first_line_indent * 12) - - # 设置对齐方式 - if quote_style.paragraph.alignment == "center": - para.alignment = WD_ALIGN_PARAGRAPH.CENTER - elif quote_style.paragraph.alignment == "right": - para.alignment = WD_ALIGN_PARAGRAPH.RIGHT - elif quote_style.paragraph.alignment == "justify": - para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY + self._apply_quote_style(para) elif element_type == 'code_block': self._add_code_block(doc, element.get('content', ''), element.get('language', '')) @@ -282,6 +252,58 @@ class DocxGenerator: elif element_type == 'empty': doc.add_paragraph() + def _apply_list_style(self, paragraph, list_type: str) -> None: + """ + 应用列表样式到段落 + + Args: + paragraph: DOCX段落对象 + list_type: 列表类型 ('unordered' 或 'ordered') + """ + if not (hasattr(self, 'current_document_style') and self.current_document_style): + return + + list_style = None + if list_type == 'unordered' and self.current_document_style.unordered_list: + list_style = self.current_document_style.unordered_list + elif list_type == 'ordered' and self.current_document_style.ordered_list: + list_style = self.current_document_style.ordered_list + + if list_style and list_style.paragraph: + if list_style.paragraph.space_before > 0: + paragraph.paragraph_format.space_before = Pt(list_style.paragraph.space_before) + if list_style.paragraph.space_after > 0: + paragraph.paragraph_format.space_after = Pt(list_style.paragraph.space_after) + + def _apply_quote_style(self, paragraph) -> None: + """ + 应用引用块样式到段落 + + Args: + paragraph: DOCX段落对象 + """ + if not (hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.quote_block): + return + + quote_style = self.current_document_style.quote_block + if quote_style.paragraph: + if quote_style.paragraph.line_spacing > 0: + paragraph.paragraph_format.line_spacing = quote_style.paragraph.line_spacing + if quote_style.paragraph.space_before > 0: + paragraph.paragraph_format.space_before = Pt(quote_style.paragraph.space_before) + if quote_style.paragraph.space_after > 0: + paragraph.paragraph_format.space_after = Pt(quote_style.paragraph.space_after) + if quote_style.paragraph.first_line_indent > 0: + paragraph.paragraph_format.first_line_indent = Pt(quote_style.paragraph.first_line_indent * 12) + + # 设置对齐方式 + if quote_style.paragraph.alignment == "center": + paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER + elif quote_style.paragraph.alignment == "right": + paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT + elif quote_style.paragraph.alignment == "justify": + paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY + def _add_formatted_paragraph(self, doc: DocxDocument, content: str) -> None: """ 添加带格式的段落 @@ -551,20 +573,23 @@ class DocxGenerator: output_path: 输出文件路径(用于临时文件) """ try: - # 处理图片 - img, width = ImageProcessor.process_image(image_path) + # 使用优化方法处理图片 + temp_dir = os.path.dirname(output_path) + optimized_image_path = ImageProcessor.optimize_image_for_docx(image_path, temp_dir) + + # 处理图片(方向修正和尺寸调整) + img, width = ImageProcessor.process_image(optimized_image_path) temp_img_path = None if 
config.image_resize == "width": # 需要保存临时图片 - temp_dir = os.path.dirname(output_path) os.makedirs(temp_dir, exist_ok=True) temp_img_path = os.path.join(temp_dir, f"temp_img_{hash(image_path)}.png") img.save(temp_img_path) self.temp_files.append(temp_img_path) img_path = temp_img_path else: - img_path = image_path + img_path = optimized_image_path if optimized_image_path != image_path else image_path # 创建段落并插入图片 para = doc.add_paragraph() diff --git a/file_handler.py b/file_handler.py index ba834a0..f647dfa 100644 --- a/file_handler.py +++ b/file_handler.py @@ -6,7 +6,7 @@ import os import glob -from typing import List, Dict, Any +from typing import List, Dict, Any, Optional from config import config @@ -67,22 +67,38 @@ class FileHandler: if not os.path.isdir(images_root): raise Exception(f"图片根文件夹不存在: {images_root}") - # 获取所有图片文件夹 - all_image_folders = [] - for root, dirs, _ in os.walk(images_root): - for dir_name in dirs: - folder_path = os.path.join(root, dir_name) - all_image_folders.append({ - "path": folder_path, - "name": dir_name, - "relative_path": os.path.relpath(folder_path, images_root) - }) - matched_pairs = [] + # 优化:直接在遍历过程中进行匹配,避免先获取所有文件夹 for txt in txt_files: - matches = FileHandler._find_matches_for_txt(txt, all_image_folders) + matches = [] + txt_name = txt["name"].lower() + # 遍历图片根目录下的所有子目录进行匹配 + for root, dirs, _ in os.walk(images_root): + for dir_name in dirs: + folder_path = os.path.join(root, dir_name) + folder_name = dir_name.lower() + + if config.match_pattern == "exact" and txt_name == folder_name: + matches.append({ + "path": folder_path, + "name": dir_name, + "relative_path": os.path.relpath(folder_path, images_root) + }) + elif config.match_pattern == "prefix" and folder_name.startswith(txt_name): + matches.append({ + "path": folder_path, + "name": dir_name, + "relative_path": os.path.relpath(folder_path, images_root) + }) + elif config.match_pattern == "contains" and txt_name in folder_name: + matches.append({ + "path": folder_path, + "name": dir_name, + "relative_path": os.path.relpath(folder_path, images_root) + }) + if matches: # 选择最短路径的匹配项 matches.sort(key=lambda x: len(x["relative_path"])) @@ -100,33 +116,6 @@ class FileHandler: return matched_pairs - @staticmethod - def _find_matches_for_txt(txt_info: Dict[str, str], image_folders: List[Dict[str, str]]) -> List[Dict[str, str]]: - """ - 为单个TXT文件查找匹配的图片文件夹 - - Args: - txt_info: TXT文件信息 - image_folders: 所有图片文件夹信息列表 - - Returns: - List[Dict[str, str]]: 匹配的图片文件夹列表 - """ - matches = [] - txt_name = txt_info["name"].lower() - - for img_folder in image_folders: - folder_name = img_folder["name"].lower() - - if config.match_pattern == "exact" and txt_name == folder_name: - matches.append(img_folder) - elif config.match_pattern == "prefix" and folder_name.startswith(txt_name): - matches.append(img_folder) - elif config.match_pattern == "contains" and txt_name in folder_name: - matches.append(img_folder) - - return matches - @staticmethod def get_image_files(folder_path: str) -> List[str]: """ @@ -142,17 +131,16 @@ class FileHandler: return [] image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.gif', '*.webp', '*.tiff'] - image_files = [] + image_files = set() # 使用集合去重 for ext in image_extensions: + # 优化:只进行一次glob搜索,同时匹配大小写 pattern = os.path.join(folder_path, ext) - image_files.extend(glob.glob(pattern)) - # 也检查大写扩展名 + image_files.update(glob.glob(pattern)) pattern_upper = os.path.join(folder_path, ext.upper()) - image_files.extend(glob.glob(pattern_upper)) + image_files.update(glob.glob(pattern_upper)) - # 
去重(防止大小写扩展名重复) - image_files = list(set(image_files)) + image_files = list(image_files) # 转换回列表 # 根据配置排序 if config.image_sort_by == "name": @@ -234,7 +222,7 @@ class FileHandler: return output_path @staticmethod - def validate_paths(txt_folder: str, images_root: str, output_root: str = None) -> Dict[str, bool]: + def validate_paths(txt_folder: str, images_root: str, output_root: Optional[str] = None) -> Dict[str, bool]: """ 验证路径的有效性 diff --git a/gui_style_manager.py b/gui_style_manager.py index 6903d90..233a676 100644 --- a/gui_style_manager.py +++ b/gui_style_manager.py @@ -9,12 +9,9 @@ from tkinter import ttk, filedialog, messagebox, simpledialog from typing import Optional from style_manager import style_manager, DocumentStyle -from config import Config +from config import config as config_manager from advanced_style_editor import open_advanced_editor -# 创建配置管理器实例 -config_manager = Config() - def create_style_tab(parent): """创建样式管理选项卡""" @@ -41,7 +38,8 @@ def create_style_tab(parent): def on_style_change(*args): config_manager.current_style = style_var.get() - config_manager.save_to_file('config.json') + from config import CONFIG_FILE_PATH + config_manager.save_to_file(CONFIG_FILE_PATH) _update_style_info() style_var.trace('w', on_style_change) diff --git a/replacestr.py b/replacestr.py index d415703..6cc21ad 100644 --- a/replacestr.py +++ b/replacestr.py @@ -393,6 +393,10 @@ def replace_text(text): if len(sys.argv) > 1 and sys.argv[1] == 'test': run_tests() sys.exit(0) + + # 实际的文本处理逻辑 + processor = TextProcessor(min_length=30) + return processor.process_text(text) if __name__ == "__main__": # 命令行模式 diff --git a/style_manager.py b/style_manager.py index 7a53f8d..d0ade4f 100644 --- a/style_manager.py +++ b/style_manager.py @@ -174,6 +174,33 @@ class StyleManager: styles = {} # 1. 爆款文章风格 - 参考知乎、头条等平台 + styles["爆款文章风格"] = self._create_viral_style() + + # 2. 微信公众号风格 - 专业的新媒体排版 + styles["微信公众号风格"] = self._create_wechat_style() + + # 3. 知乎高赞回答风格 - 逻辑清晰,层次分明 + styles["知乎高赞回答风格"] = self._create_zhihu_style() + + # 4. 小红书笔记风格 - 清新文艺,少女心 + styles["小红书笔记风格"] = self._create_xiaohongshu_style() + + # 5. 今日头条新闻风格 - 信息量大,节奏紧凑 + styles["今日头条新闻风格"] = self._create_toutiao_style() + + # 6. B站UP主视频脚本风格 - 轻松活泼,年轻化 + styles["B站UP主视频脚本风格"] = self._create_bilibili_style() + + # 7. 企业微信群通知风格 - 正式严肃 + styles["企业微信群通知风格"] = self._create_enterprise_style() + + # 8. 情感鸡汤文风格 - 温暖治愈 + styles["情感鸡汤文风格"] = self._create_emotional_style() + + return styles + + def _create_viral_style(self) -> DocumentStyle: + """创建爆款文章风格""" viral_style = DocumentStyle( name="爆款文章风格", description="高阅读量爆款文章风格,层次分明,吸引眼球", @@ -223,9 +250,10 @@ class StyleManager: background_color="#F8F9FA" ) - styles["爆款文章风格"] = viral_style - - # 2. 微信公众号风格 - 专业的新媒体排版 + return viral_style + + def _create_wechat_style(self) -> DocumentStyle: + """创建微信公众号风格""" wechat_style = DocumentStyle( name="微信公众号风格", description="专业的微信公众号排版,阅读体验佳", @@ -267,9 +295,10 @@ class StyleManager: border=True ) - styles["微信公众号风格"] = wechat_style - - # 3. 知乎高赞回答风格 - 逻辑清晰,层次分明 + return wechat_style + + def _create_zhihu_style(self) -> DocumentStyle: + """创建知乎高赞回答风格""" zhihu_style = DocumentStyle( name="知乎高赞回答风格", description="逻辑清晰,层次分明,专业权威", @@ -300,9 +329,10 @@ class StyleManager: line_spacing=1.3, space_before=10, space_after=8 ) - styles["知乎高赞回答风格"] = zhihu_style - - # 4. 
小红书笔记风格 - 清新文艺,少女心 + return zhihu_style + + def _create_xiaohongshu_style(self) -> DocumentStyle: + """创建小红书笔记风格""" xiaohongshu_style = DocumentStyle( name="小红书笔记风格", description="清新文艺,适合生活方式类内容", @@ -332,9 +362,10 @@ class StyleManager: line_spacing=1.3, space_before=8, space_after=6 ) - styles["小红书笔记风格"] = xiaohongshu_style - - # 5. 今日头条新闻风格 - 信息量大,节奏紧凑 + return xiaohongshu_style + + def _create_toutiao_style(self) -> DocumentStyle: + """创建今日头条新闻风格""" toutiao_style = DocumentStyle( name="今日头条新闻风格", description="信息密度高,节奏紧凑,突出重点", @@ -364,9 +395,10 @@ class StyleManager: line_spacing=1.3, space_before=8, space_after=6 ) - styles["今日头条新闻风格"] = toutiao_style - - # 6. B站UP主视频脚本风格 - 轻松活泼,年轻化 + return toutiao_style + + def _create_bilibili_style(self) -> DocumentStyle: + """创建B站UP主视频脚本风格""" bilibili_style = DocumentStyle( name="B站UP主视频脚本风格", description="轻松活泼,适合年轻受众,有趣有料", @@ -396,9 +428,10 @@ class StyleManager: line_spacing=1.3, space_before=8, space_after=6 ) - styles["B站UP主视频脚本风格"] = bilibili_style - - # 7. 企业微信群通知风格 - 正式严肃 + return bilibili_style + + def _create_enterprise_style(self) -> DocumentStyle: + """创建企业微信群通知风格""" enterprise_style = DocumentStyle( name="企业微信群通知风格", description="正式严肃,信息传达清晰,商务风格", @@ -423,9 +456,10 @@ class StyleManager: line_spacing=1.3, space_before=12, space_after=8 ) - styles["企业微信群通知风格"] = enterprise_style - - # 8. 情感鸡汤文风格 - 温暖治愈 + return enterprise_style + + def _create_emotional_style(self) -> DocumentStyle: + """创建情感鸡汤文风格""" emotional_style = DocumentStyle( name="情感鸡汤文风格", description="温暖治愈,情感丰富,適合心灵鸡汤类内容", @@ -460,9 +494,7 @@ class StyleManager: background_color="#FFF3E0" ) - styles["情感鸡汤文风格"] = emotional_style - - return styles + return emotional_style def _load_custom_styles(self) -> None: """加载自定义样式""" diff --git a/test.py b/test.py new file mode 100644 index 0000000..abaad74 --- /dev/null +++ b/test.py @@ -0,0 +1,438 @@ +import re +import random +from typing import List, Dict, Tuple, Optional +import jieba +import jieba.posseg as pseg + + +class EnhancedArticleRewriter: + """ + 增强版文章智能改写工具 + 实现更自然的句子变化,符合人工书写逻辑 + """ + + def __init__(self): + # 标点符号定义 + self.sentence_endings = ['。', '!', '?', '…'] + self.pause_marks = [',', ';', ':', '、'] + + # 句子长度分布(模拟人工书写习惯) + self.sentence_length_distribution = { + 'short': (5, 15), # 短句 + 'medium': (16, 30), # 中句 + 'long': (31, 50), # 长句 + 'extra_long': (51, 80) # 超长句 + } + + # 连接词库(更丰富的连接词) + self.connectors = { + 'sequence': ['随后', '接着', '然后', '紧接着', '继而', '进而'], + 'addition': ['并且', '同时', '此外', '另外', '再者', '况且', '而且'], + 'contrast': ['但是', '然而', '不过', '可是', '却', '反而', '相反'], + 'cause': ['因为', '由于', '因此', '所以', '故而', '从而'], + 'condition': ['如果', '假如', '倘若', '若是', '要是'], + 'concession': ['虽然', '尽管', '即使', '纵然', '固然'], + 'summary': ['总之', '综上', '总的来说', '概括地说', '简言之'], + 'example': ['比如', '例如', '譬如', '好比', '正如'], + 'emphasis': ['特别是', '尤其是', '更重要的是', '值得注意的是'], + 'explanation': ['也就是说', '换句话说', '具体来说', '准确地说'] + } + + # 句式模板 + self.sentence_patterns = { + 'statement': ['{}'], # 陈述句 + 'emphasis_front': ['值得注意的是,{}', '需要强调的是,{}', '重要的是,{}'], + 'emphasis_back': ['{},这一点尤为重要', '{},这是关键所在'], + 'question_rhetorical': ['难道不是{}吗?', '{},不是吗?'], + 'parallel': ['不仅{},而且{}', '既{},又{}', '一方面{},另一方面{}'], + 'progressive': ['先是{},然后{}', '从{}到{}', '由{}发展到{}'] + } + + # 同义词/近义词替换库 + self.synonyms = { + '发展': ['演进', '进步', '演变', '发展', '进化', '提升', '推进'], + '改变': ['变化', '转变', '改变', '变革', '转换', '调整', '革新'], + '重要': ['关键', '重要', '核心', '主要', '根本', '要紧', '关键性'], + '影响': ['作用', '影响', '效应', '冲击', '波及', '涉及'], + '提高': ['提升', '增强', '改善', '优化', 
'加强', '增进'], + '显示': ['表明', '显示', '说明', '揭示', '体现', '反映', '展现'], + '通过': ['利用', '运用', '借助', '凭借', '依靠', '经由'], + '实现': ['达成', '实现', '完成', '达到', '做到', '落实'], + '问题': ['难题', '问题', '挑战', '困难', '障碍', '瓶颈'], + '方法': ['方式', '手段', '途径', '办法', '策略', '措施'], + '需要': ['需要', '要求', '必须', '应该', '亟需', '急需'], + '能够': ['能够', '可以', '能', '可', '得以', '足以'], + '非常': ['十分', '相当', '特别', '格外', '极其', '异常', '颇为'], + '很多': ['许多', '大量', '众多', '诸多', '不少', '大批'], + '所有': ['全部', '一切', '所有', '整个', '全体', '各个'], + '已经': ['已', '已经', '业已', '早已', '都已'], + '正在': ['正', '正在', '在', '正处于', '目前正'], + '越来越': ['日益', '愈发', '愈加', '更加', '日渐', '渐渐'], + '不断': ['持续', '不断', '连续', '陆续', '继续', '频繁'], + '各种': ['各类', '各种', '多种', '种种', '诸般', '多样'], + } + + def _get_random_sentence_length_type(self) -> str: + """根据正态分布随机选择句子长度类型""" + # 模拟人工书写的句长分布:中句最多,短句和长句次之,超长句最少 + weights = {'short': 25, 'medium': 40, 'long': 25, 'extra_long': 10} + types = list(weights.keys()) + probs = [weights[t] / 100 for t in types] + return random.choices(types, weights=probs)[0] + + def _smart_split_merge_sentences(self, sentences: List[str]) -> List[str]: + """智能拆分和合并句子,创造自然的长短句节奏""" + if not sentences: + return sentences + + result = [] + i = 0 + + while i < len(sentences): + # 获取目标句长类型 + target_type = self._get_random_sentence_length_type() + min_len, max_len = self.sentence_length_distribution[target_type] + + current_sentence = sentences[i].strip() + current_len = len(current_sentence) + + # 如果当前句子太长,尝试拆分 + if current_len > max_len: + split_sentences = self._split_sentence_naturally(current_sentence, max_len) + result.extend(split_sentences) + + # 如果当前句子太短,尝试与下一句合并 + elif current_len < min_len and i + 1 < len(sentences): + # 30%概率合并短句 + if random.random() < 0.3: + merged = self._merge_sentences(current_sentence, sentences[i + 1]) + result.append(merged) + i += 1 # 跳过下一句 + else: + result.append(current_sentence) + + # 长度合适,直接添加 + else: + result.append(current_sentence) + + i += 1 + + return result + + def _split_sentence_naturally(self, sentence: str, max_length: int) -> List[str]: + """自然地拆分长句""" + if len(sentence) <= max_length: + return [sentence] + + # 保存句尾标点 + ending = '' + for mark in self.sentence_endings: + if sentence.endswith(mark): + ending = mark + sentence = sentence[:-len(mark)] + break + + # 优先在逗号处拆分 + parts = [] + if ',' in sentence: + segments = sentence.split(',') + current = "" + + for i, segment in enumerate(segments): + if not current: + current = segment + elif len(current + ',' + segment) <= max_length: + current += ',' + segment + else: + # 添加句号使其成为完整句子 + if random.random() < 0.7: # 70%概率添加句号 + parts.append(current + '。') + else: # 30%概率使用其他句尾 + parts.append(current + random.choice(['。', '!', ''])) + current = segment + + # 处理最后一部分 + if current: + parts.append(current + ending) + else: + # 如果没有逗号,尝试在其他标点处拆分 + parts = [sentence + ending] + + return parts if parts else [sentence + ending] + + def _merge_sentences(self, sent1: str, sent2: str) -> str: + """智能合并两个句子""" + # 移除第一个句子的句尾标点 + for mark in self.sentence_endings: + if sent1.endswith(mark): + sent1 = sent1[:-len(mark)] + break + + # 选择连接方式 + merge_type = random.choice(['comma', 'connector', 'semicolon']) + + if merge_type == 'comma': + return sent1 + ',' + sent2 + elif merge_type == 'connector': + # 随机选择连接词类型 + conn_type = random.choice(list(self.connectors.keys())) + connector = random.choice(self.connectors[conn_type]) + return sent1 + ',' + connector + sent2 + else: # semicolon + return sent1 + ';' + sent2 + + def _replace_synonyms(self, text: str, intensity: float) -> str: + 
"""同义词替换""" + words = list(jieba.cut(text)) + result = [] + + for word in words: + if word in self.synonyms and random.random() < intensity: + # 选择一个同义词(避免选到原词) + alternatives = [w for w in self.synonyms[word] if w != word] + if alternatives: + result.append(random.choice(alternatives)) + else: + result.append(word) + else: + result.append(word) + + return ''.join(result) + + def _adjust_sentence_structure(self, sentence: str, intensity: float) -> str: + """调整句子结构,使其更自然""" + if random.random() > intensity: + return sentence + + # 保存句尾标点 + ending = '' + for mark in self.sentence_endings: + if sentence.endswith(mark): + ending = mark + sentence = sentence[:-len(mark)] + break + + # 随机选择调整方式 + adjust_type = random.choice(['reorder', 'add_emphasis', 'change_pattern']) + + if adjust_type == 'reorder' and ',' in sentence: + # 重新排列子句 + parts = sentence.split(',') + if len(parts) >= 2: + # 智能重排:不是完全随机,而是有逻辑的调整 + if len(parts) == 2: + # 两个子句直接交换 + sentence = parts[1] + ',' + parts[0] + else: + # 多个子句,将中间的提前或延后 + mid_idx = len(parts) // 2 + if random.random() < 0.5: + # 中间提前 + parts = [parts[mid_idx]] + parts[:mid_idx] + parts[mid_idx + 1:] + else: + # 中间延后 + parts = parts[:mid_idx] + parts[mid_idx + 1:] + [parts[mid_idx]] + sentence = ','.join(parts) + + elif adjust_type == 'add_emphasis': + # 添加强调 + if random.random() < 0.3: + pattern = random.choice(self.sentence_patterns['emphasis_front']) + sentence = pattern.format(sentence) + elif random.random() < 0.3: + pattern = random.choice(self.sentence_patterns['emphasis_back']) + sentence = pattern.format(sentence) + + elif adjust_type == 'change_pattern': + # 改变句式 + if ',' in sentence and random.random() < 0.4: + parts = sentence.split(',', 1) + if len(parts) == 2: + # 使用并列或递进句式 + if random.random() < 0.5: + sentence = f"不仅{parts[0]},而且{parts[1]}" + else: + sentence = f"{parts[0]},进而{parts[1]}" + + return sentence + ending + + def _add_natural_variations(self, sentence: str, intensity: float) -> str: + """添加自然的语言变化""" + if random.random() > intensity: + return sentence + + variations = [] + + # 20%概率添加过渡词 + if random.random() < 0.2: + transition = random.choice(['其实', '事实上', '实际上', '确实', '显然']) + variations.append(f"{transition},{sentence}") + + # 15%概率添加程度副词 + if random.random() < 0.15: + adverb = random.choice(['更', '更加', '尤其', '特别', '格外']) + # 简单地在"是"、"有"、"能"等词前添加副词 + for verb in ['是', '有', '能', '会', '要']: + if verb in sentence: + sentence = sentence.replace(verb, f"{adverb}{verb}", 1) + break + + return variations[0] if variations else sentence + + def rewrite(self, text: str, config: Dict = None) -> str: + """ + 主函数:改写文章 + + 参数: + text: 输入文章 + config: 配置字典 + - intensity: 修改强度 0.0-1.0 + - preserve_meaning: 是否保持原意 + - natural_flow: 是否保持自然流畅 + - vary_sentence_length: 是否变化句长 + """ + if config is None: + config = {} + + # 设置默认值 + config.setdefault('intensity', 0.6) + config.setdefault('preserve_meaning', True) + config.setdefault('natural_flow', True) + config.setdefault('vary_sentence_length', True) + + intensity = config['intensity'] + + # 分段处理 + paragraphs = [p.strip() for p in text.split('\n') if p.strip()] + result_paragraphs = [] + + for para in paragraphs: + # 分句 + sentences = self._split_sentences(para) + + # 1. 首先进行句子长短调整 + if config['vary_sentence_length']: + sentences = self._smart_split_merge_sentences(sentences) + + # 2. 
处理每个句子 + processed_sentences = [] + for i, sent in enumerate(sentences): + # 同义词替换 + sent = self._replace_synonyms(sent, intensity * 0.5) + + # 句子结构调整 + sent = self._adjust_sentence_structure(sent, intensity * 0.7) + + # 添加自然变化 + sent = self._add_natural_variations(sent, intensity * 0.3) + + processed_sentences.append(sent) + + # 3. 段落重组(偶尔调整句子顺序) + if len(processed_sentences) > 3 and random.random() < intensity * 0.2: + # 20%概率微调句子顺序(只交换相邻句子) + idx = random.randint(0, len(processed_sentences) - 2) + processed_sentences[idx], processed_sentences[idx + 1] = \ + processed_sentences[idx + 1], processed_sentences[idx] + + result_paragraphs.append(''.join(processed_sentences)) + + return '\n\n'.join(result_paragraphs) + + def _split_sentences(self, text: str) -> List[str]: + """改进的句子分割""" + # 处理多种句尾标点 + pattern = '([。!?…]+)' + parts = re.split(pattern, text) + + sentences = [] + for i in range(0, len(parts) - 1, 2): + if parts[i].strip(): + sentences.append(parts[i] + parts[i + 1]) + + # 处理最后一个部分 + if len(parts) % 2 == 1 and parts[-1].strip(): + sentences.append(parts[-1] + '。') # 添加默认句号 + + return sentences + + +def demo(): + """使用示例""" + sample_text = """ +最近,晓蕾又上热搜了! + +咋回事呢?原来,她和老公刘剑一起开了直播带货的副业。但特意声明:她早就离开了上海电视台的编制,也不拿电视台的工资。换句话说,现在卖东西,完全是私营业态。 + +这事儿一下子引爆了大家的八卦魂。毕竟,明星主持扎堆直播间,也不算新鲜事。但还是挺多人纳闷:这些当年的 "话筒头牌",是不是集体选择摆烂了? + +其实,晓蕾和刘剑干脆落落大方,在直播间直接回应了这点。俩人意思很明确:“我们不是来拉低职业口碑的”。而且还耐心解释了自己转行的理由。 +曾经的大佬,变成了烟火气 + +说到晓蕾,不了解点她背景都不好意思讨论人家。当年上视新闻部的 "当家花旦",光学历和气质,足够秒杀隔壁主持圈的八条街。而刘剑,早年可是 "台柱子",播音腔精致到令人耳膜怀孕。照理来说,这样一对,在编制铁饭碗里躺平一辈子没毛病。 + +可人家偏不。 + +晓蕾说过这样一句话:“其实,我就是个普通人。” 真的那么普通吗?她不这么说,没人敢忘了她的标杆履历啊!她离开台里后,居然一头扎进了童语言教育这个赛道,一干就是十年,让机构做到了业内小圈子的爆款水准。 + +而这次直播,打的商品也不混乱,主打性价比和实用属性,晓蕾每件商品还得亲测过。如果你觉得她自吹自擂,建议去看看她直播间的粉丝评论。大家的意思是:晓蕾推品 = ·9放心买。 +刘剑这枚 “前一哥”,更狠! + +说晓蕾牛,别忘了,刘剑十年前也上演了一场 “豪赌”。那个年代,辞去电视台稳定工作,和 “打水漂” 差不多。 + +可是刘剑敢把梭全下,为啥?因为他看中了播音考生和辅导课程的市场,那时还没有多少人扎堆干这块,他觉得这是个机会。 + +果然,就这么辞了职,工作的腰板从跟组织吃工资,摇身变成了名副其实的事业单位 —— 自己家老板。虽然后来也是磕磕绊绊,但终究从试验田里掘出了一片肥沃地。 +主持人的 “下海”,是换方向走 + +有人觉得,曾经的新闻人、主持人 “跑去带货”,肯定是混不下去了。你要放在十年前,这种联想不稀奇,可现在不一样了。大环境变了,传统媒体是真的在互联网时代被打败得找不到调。 + +原来电视频道的观众,现在早转移到手机端,看知乎、刷短视频,甚至晚上蹲个带货直播会。你说新闻节目的高冷主播,现在换脸做带货主持,是不是 “落魄”?未必。 + +晓蕾夫妻这一波,实际上是转型很成功的范例。不管带啥网红货,他们俩把品质第一的逻辑摆明白了。这样的主播,不止卖产品,更卖信誉,靠着时间积攒了观众的信任。 +直播间哪门子 LOW?明明是主战场 + +网友说得有趣:“谁嫌直播带货 LOW,谁就输定了。” 道理没跑儿,移动互联网成了咱生活重心,生意也跟着迁移。这是明显趋势,看不懂的还真不想赚钱了。 + +而且,做直播一点不轻松。站几个小时口播、随时照顾弹幕情绪,这比坐着念提词器辛苦多了。像晓蕾和刘剑这样的 “摸鱼资历”,能转过身来赚饭钱,这不是 “混”,是 “拼” 啊。 + +别说传统意义的职业崇拜消失殆尽,你觉得稳如狗的岗位,说散架就散。老一辈金饭碗情结,对于下一代新创别说香,而是种被淘汰跑赢速度内心创新积极点。 + +我不是电视台员工了,早就离职 10 年了。 +""" + + rewriter = EnhancedArticleRewriter() + + print("=" * 60) + print("原文:") + print("=" * 60) + print(sample_text) + + # 测试不同强度的改写 + for intensity in [0.3, 0.6, 0.9]: + print(f"\n{'=' * 60}") + print(f"改写强度: {intensity}") + print("=" * 60) + + config = { + 'intensity': intensity, + 'preserve_meaning': True, + 'natural_flow': True, + 'vary_sentence_length': True + } + + result = rewriter.rewrite(sample_text, config) + print(result) + + # 统计句子长度分布 + sentences = re.split('[。!?…]+', result) + lengths = [len(s) for s in sentences if s.strip()] + if lengths: + print(f"\n句子长度分布: 最短={min(lengths)}, 最长={max(lengths)}, 平均={sum(lengths) / len(lengths):.1f}") + print(f"句子数量: {len(lengths)}") + + +if __name__ == '__main__': + # 注意:需要安装jieba库 + # pip install jieba + demo() \ No newline at end of file diff --git a/test_1.py b/test_1.py new file mode 100644 index 0000000..3f2311d --- /dev/null +++ b/test_1.py @@ -0,0 +1,174 @@ +import re +import jieba +import random +from typing import List + + +class HeavyHumanizer: + """重度人类化改写器 - 保持逻辑,可读性高,适合绕过 AI 
检测""" + + def __init__(self): + jieba.initialize() + # 人类化开头/转折/插话 + self.openings = ['说到', '提到', '关于', '其实', '要说', '你知道吗', '顺便说'] + self.transitions = ['但是', '不过', '然而', '话说回来', '可惜的是', '偏偏'] + self.fillers = ['其实', '当然', '显然', '我觉得', '说起来', '顺便说'] + # 主观表达 + self.subjective = ['我认为', '我觉得', '在我看来', '就我所知', '据我了解'] + # 口语词汇 + self.colloquial_particles = ['呢', '吧', '啊', '哦', '嘛', '哈', '呀'] + # 高风险词替换 + self.high_risk = { + '重要': ['关键', '核心', '主要'], + '显著': ['明显', '突出', '很大'], + '提升': ['提高', '增强', '改善'], + '确保': ['保证', '做到', '维护'], + '实施': ['执行', '开展', '推行'] + } + # 分句标点 + self.sentence_endings = {'。', '!', '?', '.', '!', '?', '…', ';', ';'} + + def split_sentences(self, text: str) -> List[str]: + """按照句子结束标点分割文本""" + sentences = [] + current = '' + for c in text: + current += c + if c in self.sentence_endings: + sentences.append(current.strip()) + current = '' + if current.strip(): + sentences.append(current.strip()) + return sentences + + def replace_high_risk_words(self, sentence: str) -> str: + """替换高风险 AI 词汇""" + for k, v_list in self.high_risk.items(): + if k in sentence and random.random() < 0.8: + sentence = sentence.replace(k, random.choice(v_list)) + return sentence + + def add_subjective_expressions(self, sentence: str) -> str: + """随机添加主观表达或口语词""" + if random.random() < 0.3: + expr = random.choice(self.subjective) + sentence = expr + ',' + sentence + if random.random() < 0.2: + particle = random.choice(self.colloquial_particles) + if sentence.endswith('。'): + sentence = sentence[:-1] + particle + '。' + if random.random() < 0.15: + filler = random.choice(self.fillers) + sentence = filler + ',' + sentence + return sentence + + def vary_sentence_length(self, sentences: List[str]) -> List[str]: + """打乱句子顺序或拆分长句,增加突发性""" + varied = [] + i = 0 + while i < len(sentences): + s = sentences[i] + # 适度拆分长句 + if len(s) > 50 and random.random() < 0.5: + mid = len(s) // 2 + # 找最近的逗号 + comma_pos = max(s.rfind(',', 0, mid), s.rfind(',', 0, mid)) + if comma_pos > 5: + first = s[:comma_pos + 1].strip() + second = s[comma_pos + 1:].strip() + varied.extend([first, second]) + i += 1 + continue + varied.append(s) + i += 1 + # 打乱顺序但保持逻辑块 + if random.random() < 0.3: + random.shuffle(varied) + return varied + + def create_paragraphs(self, sentences: List[str]) -> List[str]: + """根据句长和随机概率生成段落""" + paragraphs = [] + current = [] + current_len = 0 + for s in sentences: + current.append(s) + current_len += len(s) + if current_len > 80 and (random.random() < 0.4 or current_len > 150): + paragraphs.append(''.join(current)) + current = [] + current_len = 0 + if current: + paragraphs.append(''.join(current)) + return paragraphs + + def humanize_text(self, text: str) -> str: + """核心人类化改写函数""" + # 清理文本 + text = re.sub(r'\s+', '', text) + sentences = self.split_sentences(text) + + # 高风险词替换 + sentences = [self.replace_high_risk_words(s) for s in sentences] + + # 添加主观表达、口语词、插话 + sentences = [self.add_subjective_expressions(s) for s in sentences] + + # 增加长短句变化和突发性 + sentences = self.vary_sentence_length(sentences) + + # 创建段落 + paragraphs = self.create_paragraphs(sentences) + + # 格式化段落 + formatted = '\n\n'.join(['  ' + p for p in paragraphs]) + return formatted + + +# ================== 使用示例 ================== +if __name__ == "__main__": + text = """ +  最近,晓蕾又上热搜了! + +  换句话说,现在卖东西,完全是私营业态。咋回事呢?但特意声明:她早就离开了上海电视台的编制,也不拿电视台的工资欸。原来,她和老公刘剑一起开了直播带货的副业。 + +  这事儿一下子引爆了大家的八卦魂。毕竟,明星主持扎堆直播间,也不算新鲜事你说呢。 + +  但还是挺多人纳闷:这些当年的 "话筒头牌",是不是集体选择摆烂了? 
+ +  其实,晓蕾和刘剑干…俩人意思很明确:“我们不是来拉低职业口碑的”。而且还耐心解释了自己转行的理由,曾经的大佬,变成了烟火气。 + +  说到晓蕾,不了解点她背景都不好意思讨论人家。当年上视新闻部的 "当家花旦",光学历和气质,足够秒杀隔壁主持圈的八条街。而刘剑,似乎早年可是 "台柱子",播音腔精致到令人耳膜怀孕嗯。照理来说,这样一对,在编制铁饭碗里躺平一辈子没毛病。 + +  可人家偏不。 + +  晓蕾说过这样一句话:“其实,我就是个普通人。” 真的那么普通吗?她不这么说,没人敢忘了她的标杆履历啊!她离开台里后,居然一头扎进了童语言教育这个赛道,一干就是十年,让机构做到了业内小圈子的爆款水准。 + +  而这次直播,打的商品也不混乱,主打性价比和实用属性,晓蕾每件商品还得亲测过。如果你觉得她自吹自擂,建议去看看她直播间的粉丝评论。大家地意思是:晓蕾推品 = 放心买。 + +  刘剑这枚 “前一哥”,更狠! + +  说晓蕾牛,看起来别忘了,刘剑十年前也上演了一场 “豪赌”。那个年代,辞去电视台稳定工作,和 “打水漂” 差不多。 + +  可是刘剑敢把梭全下,为啥?因为他看中了播音考生和辅导课程的市场,那时还没有多少人扎堆干这块,他觉得这是个机会。 + +  果然,就这么辞了职,工作的腰板从跟组织吃工资,摇身变成了名副其实的事业单位 —— 自己家老板。虽然后来也是磕磕绊绊,但终究从试验田里掘出了一片肥沃地。主持人的 “下海”,是换方向走。 + +  有人觉得,曾经的新闻人、主持人 “跑去带货”,肯定是混不下去了。你要放在十年前,这种联想不稀奇,可现在不一样了。大环境变了,看起来传统媒体是真的在互联网时代被打败得找不到调。 + +  原来电视频道的观众,现在早转移到手机端,看知乎、刷短视频,甚至晚上蹲个带货直播会。就像我说的,我认为,你说新闻节目的高冷主播,现在换脸做带货主持,是不是 “落魄”?未必。 + +  其实,晓蕾夫妻这一波,实际上是转型很成功的范例。不管带啥网红货,可能他们俩把品质第一的逻辑摆明白了啊。这样的主播,不止卖产品,更卖信誉,靠着时间积攒了观众的信任嗯。也许,直播间哪门子 LOW?明明是主战场。 + +  网友说得有趣:“谁嫌直播带货 LOW,谁就输定了。” 道理没跑儿,似乎移动互联网成了咱生活重心,生意也跟着迁移啊。 + +  这是明显趋势,看不懂的还真不想赚钱了。 + +  而且,似乎做直播一点不轻松。站几个小时口播、随时照顾弹幕情绪,这比坐着念提词器辛苦多了。其实,像晓蕾和刘剑这样的 “摸鱼资历”,能转过身来赚饭钱,这不是 “混”,是 “拼” 啊。 + +  别说传统意义的职业崇拜消失殆尽,你觉得稳如狗的岗位,说散架就散你说呢。老一辈金饭碗情结,对于下一代新创别说香,而是种被淘汰跑赢速度内心创新积极点。 + +  我不是电视台员工了,早就离职 10 年了。 """ + humanizer = HeavyHumanizer() + result = humanizer.humanize_text(text) + print(result) diff --git a/test_chinese_splitting.py b/test_chinese_splitting.py deleted file mode 100644 index e69de29..0000000 diff --git a/test_image_splitting.py b/test_image_splitting.py deleted file mode 100644 index e69de29..0000000 diff --git a/test_image_text_splitting.py b/test_image_text_splitting.py deleted file mode 100644 index e69de29..0000000 diff --git a/test_image_text_splitting_debug.py b/test_image_text_splitting_debug.py deleted file mode 100644 index e69de29..0000000 diff --git a/test_image_with_text_splitting.py b/test_image_with_text_splitting.py deleted file mode 100644 index e69de29..0000000 diff --git a/test_output/test_image_text_splitting.docx b/test_output/test_image_text_splitting.docx deleted file mode 100644 index c3fd10e..0000000 Binary files a/test_output/test_image_text_splitting.docx and /dev/null differ diff --git a/test_output/test_image_text_splitting_debug.docx b/test_output/test_image_text_splitting_debug.docx deleted file mode 100644 index 92f0f48..0000000 Binary files a/test_output/test_image_text_splitting_debug.docx and /dev/null differ diff --git a/test_output/test_image_with_text_splitting.docx b/test_output/test_image_with_text_splitting.docx deleted file mode 100644 index 5b8beab..0000000 Binary files a/test_output/test_image_with_text_splitting.docx and /dev/null differ diff --git a/test_segment.txt b/test_segment.txt deleted file mode 100644 index 489423b..0000000 --- a/test_segment.txt +++ /dev/null @@ -1 +0,0 @@ -这是一个测试文本。它包含多个句子。每个句子都很短。但是我们需要测试分段排版功能。当文本长度超过最小段落长度时。应该被分割成多个段落。这样可以提高文档的可读性。让内容更加清晰易懂。 \ No newline at end of file diff --git a/test_segment_function.py b/test_segment_function.py deleted file mode 100644 index 8c7c350..0000000 --- a/test_segment_function.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python3 -"""测试分段排版功能""" - -import sys -import os -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) - -from text_splitter import TextSplitter - -# 测试文本 -test_text = """这是一个测试文本。它包含多个句子。每个句子都很短。但是我们需要测试分段排版功能。 -当文本长度超过最小段落长度时。应该被分割成多个段落。这样可以提高文档的可读性。 -让内容更加清晰易懂。""" - -def test_text_splitting(): - print("=== 测试分段排版功能 ===") - print(f"原始文本长度: {len(test_text)} 字符") - print(f"原始文本: {test_text}") - print() - - # 创建分段器 - splitter = 
TextSplitter(min_length=50, max_length=200) - - # 分段处理 - paragraphs = splitter.split_text(test_text) - - print(f"分段结果 ({len(paragraphs)} 个段落):") - for i, paragraph in enumerate(paragraphs, 1): - print(f"段落 {i} ({len(paragraph)} 字符): {paragraph}") - - print() - print("=== 测试完成 ===") - -if __name__ == "__main__": - test_text_splitting() \ No newline at end of file diff --git a/test_split_behavior.py b/test_split_behavior.py deleted file mode 100644 index e69de29..0000000 diff --git a/test_text_processor_splitting.py b/test_text_processor_splitting.py deleted file mode 100644 index e69de29..0000000 diff --git a/test_text_splitter.py b/test_text_splitter.py deleted file mode 100644 index e69de29..0000000 diff --git a/text_splitter.py b/text_splitter.py deleted file mode 100644 index 400178e..0000000 --- a/text_splitter.py +++ /dev/null @@ -1,140 +0,0 @@ -import re - -class TextSplitter: - def __init__(self, min_length=100, max_length=300): - """ - 初始化文本分段器 - :param min_length: 目标段落最小长度 - :param max_length: 目标段落最大长度 - """ - self.min_length = min_length - self.max_length = max_length - # 匹配标点符号的正则表达式,作为分段点(中文和英文标点) - # 这些标点符号通常表示一个完整句子的结束 - self.sentence_ending_punct = re.compile(r'([。?!.!?])') - - def split_text(self, text): - """ - 将文本分割成符合长度要求的段落,仅使用标点符号分割 - :param text: 待分割的原始文本 - :return: 分割后的段落列表 - """ - if not text: - return [] - - # 自动判断原始文本长度 - original_length = len(text) - print(f"原始文本长度: {original_length} 字符") - - # 如果原始文本长度小于最小长度,直接返回 - if original_length <= self.min_length: - return [text.strip()] - - # 将文本分割成完整句子(保留标点符号) - parts = self.sentence_ending_punct.split(text) - sentences = [] - - # 重组句子,确保标点符号与前面的文本在一起 - for i in range(0, len(parts)-1, 2): - sentence = (parts[i] + parts[i+1]).strip() - if sentence: # 跳过空句子 - sentences.append(sentence) - - # 如果没有找到任何标点符号,将整个文本作为一个段落 - if not sentences: - return [text.strip()] - - # 合并句子形成段落,确保在长度范围内 - paragraphs = [] - current_paragraph = "" - - for sentence in sentences: - # 尝试添加当前句子 - temp = current_paragraph + (" " if current_paragraph else "") + sentence - - # 检查添加后是否超出最大长度 - if len(temp) > self.max_length: - # 如果当前段落不为空,先保存当前段落 - if current_paragraph: - paragraphs.append(current_paragraph) - current_paragraph = sentence - else: - # 如果单个句子就超过最大长度,也必须接受(避免分割句子) - paragraphs.append(sentence) - current_paragraph = "" - else: - current_paragraph = temp - - # 添加最后一个段落 - if current_paragraph: - paragraphs.append(current_paragraph) - - # 检查是否有段落短于最小长度,如果有则与下一段合并 - i = 0 - while i < len(paragraphs) - 1: - if len(paragraphs[i]) < self.min_length: - # 合并当前段落和下一段落 - paragraphs[i] = paragraphs[i] + " " + paragraphs[i+1] - del paragraphs[i+1] - else: - i += 1 - - print(f"分割后段落数量: {len(paragraphs)}") - return paragraphs - -# 使用示例 -if __name__ == "__main__": - # 示例文本 - sample_text = """ -最近,晓蕾又上热搜了! - -咋回事呢?原来,她和老公刘剑一起开了直播带货的副业。但特意声明:她早就离开了上海电视台的编制,也不拿电视台的工资。换句话说,现在卖东西,完全是私营业态。 - -这事儿一下子引爆了大家的八卦魂。毕竟,明星主持扎堆直播间,也不算新鲜事。但还是挺多人纳闷:这些当年的 "话筒头牌",是不是集体选择摆烂了? - -其实,晓蕾和刘剑干脆落落大方,在直播间直接回应了这点。俩人意思很明确:“我们不是来拉低职业口碑的”。而且还耐心解释了自己转行的理由。 -曾经的大佬,变成了烟火气 - -说到晓蕾,不了解点她背景都不好意思讨论人家。当年上视新闻部的 "当家花旦",光学历和气质,足够秒杀隔壁主持圈的八条街。而刘剑,早年可是 "台柱子",播音腔精致到令人耳膜怀孕。照理来说,这样一对,在编制铁饭碗里躺平一辈子没毛病。 - -可人家偏不。 - -晓蕾说过这样一句话:“其实,我就是个普通人。” 真的那么普通吗?她不这么说,没人敢忘了她的标杆履历啊!她离开台里后,居然一头扎进了童语言教育这个赛道,一干就是十年,让机构做到了业内小圈子的爆款水准。 - -而这次直播,打的商品也不混乱,主打性价比和实用属性,晓蕾每件商品还得亲测过。如果你觉得她自吹自擂,建议去看看她直播间的粉丝评论。大家的意思是:晓蕾推品 = 放心买。 -刘剑这枚 “前一哥”,更狠! 
- -说晓蕾牛,别忘了,刘剑十年前也上演了一场 “豪赌”。那个年代,辞去电视台稳定工作,和 “打水漂” 差不多。 - -可是刘剑敢把梭全下,为啥?因为他看中了播音考生和辅导课程的市场,那时还没有多少人扎堆干这块,他觉得这是个机会。 - -果然,就这么辞了职,工作的腰板从跟组织吃工资,摇身变成了名副其实的事业单位 —— 自己家老板。虽然后来也是磕磕绊绊,但终究从试验田里掘出了一片肥沃地。 -主持人的 “下海”,是换方向走 - -有人觉得,曾经的新闻人、主持人 “跑去带货”,肯定是混不下去了。你要放在十年前,这种联想不稀奇,可现在不一样了。大环境变了,传统媒体是真的在互联网时代被打败得找不到调。 - -原来电视频道的观众,现在早转移到手机端,看知乎、刷短视频,甚至晚上蹲个带货直播会。你说新闻节目的高冷主播,现在换脸做带货主持,是不是 “落魄”?未必。 - -晓蕾夫妻这一波,实际上是转型很成功的范例。不管带啥网红货,他们俩把品质第一的逻辑摆明白了。这样的主播,不止卖产品,更卖信誉,靠着时间积攒了观众的信任。 -直播间哪门子 LOW?明明是主战场 - -网友说得有趣:“谁嫌直播带货 LOW,谁就输定了。” 道理没跑儿,移动互联网成了咱生活重心,生意也跟着迁移。这是明显趋势,看不懂的还真不想赚钱了。 - -而且,做直播一点不轻松。站几个小时口播、随时照顾弹幕情绪,这比坐着念提词器辛苦多了。像晓蕾和刘剑这样的 “摸鱼资历”,能转过身来赚饭钱,这不是 “混”,是 “拼” 啊。 - -别说传统意义的职业崇拜消失殆尽,你觉得稳如狗的岗位,说散架就散。老一辈金饭碗情结,对于下一代新创别说香,而是种被淘汰跑赢速度内心创新积极点。 - -我不是电视台员工了,早就离职 10 年了。""" - - # 创建分段器实例,设置目标段落长度范围 - splitter = TextSplitter(min_length=10, max_length=20) - - # 分割文本 - paragraphs = splitter.split_text(sample_text) - - # 打印结果 - print("\n分割结果:") - for i, para in enumerate(paragraphs, 1): - - print(para) - \ No newline at end of file
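
The rewritten progress callback in batch_processor.py replaces the float-based (current_index + progress/100) / total_count computation with pure integer arithmetic: clamp the per-file percentage to 0-100 first, then take (current_index * 100 + file_progress) // total_count. A standalone sketch of that arithmetic (the helper name overall_progress is illustrative, not from the module):

def overall_progress(current_index: int, file_progress: int, total_count: int) -> int:
    """Integer-only batch progress, mirroring the new batch_processor.py math.

    current_index is 0-based; file_progress is the current file's 0-100
    percentage. Clamping first keeps a misbehaving callback from pushing
    the overall figure outside 0-100.
    """
    file_progress = max(0, min(100, file_progress))
    if total_count <= 0:
        return file_progress
    return max(0, min(100, (current_index * 100 + file_progress) // total_count))


# File 3 of 4 (index 2) at 50% -> (200 + 50) // 4 == 62
assert overall_progress(2, 50, 4) == 62
assert overall_progress(3, 100, 4) == 100
assert overall_progress(0, -5, 4) == 0  # out-of-range input is clamped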
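
The inlined matching loop in file_handler.py compares lowercased TXT base names against image-folder names under three configurable modes (exact, prefix, contains) and, when several folders match, keeps the one with the shortest relative path. The same logic reduced to two self-contained helpers (the names matches_folder and best_match are illustrative, not part of the module):

from typing import Dict, List


def matches_folder(txt_name: str, folder_name: str, pattern: str) -> bool:
    """The three match modes from file_handler.py, case-insensitive."""
    t, f = txt_name.lower(), folder_name.lower()
    if pattern == "exact":
        return t == f
    if pattern == "prefix":
        return f.startswith(t)
    if pattern == "contains":
        return t in f
    return False


def best_match(matches: List[Dict[str, str]]) -> Dict[str, str]:
    """Same tie-break as the diff: the shortest relative path wins."""
    return min(matches, key=lambda m: len(m["relative_path"]))


assert matches_folder("chapter1", "Chapter1_figs", "prefix")
assert matches_folder("chapter1", "all_chapter1_figs", "contains")
assert not matches_folder("chapter1", "Chapter1_figs", "exact")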
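
The docx_generator.py hunk routes every image through ImageProcessor.optimize_image_for_docx(image_path, temp_dir) before the existing process_image step. That method's body is not in this diff; the call site only implies a contract, namely that the optimizer may hand back the input path unchanged (as written, both branches of the `img_path = optimized_image_path if optimized_image_path != image_path else image_path` conditional yield the same value, so it reduces to `img_path = optimized_image_path`). The following is a plausible Pillow-based sketch under that contract, with an assumed size threshold, not the project's actual implementation:

import os
from PIL import Image  # Pillow

# Assumed threshold: the real ImageProcessor may use a different rule.
MAX_DIMENSION = 1600


def optimize_image_for_docx(image_path: str, temp_dir: str) -> str:
    """Hypothetical stand-in: shrink oversized images before embedding.

    Returns the original path untouched when no optimization is needed,
    which is what the call site's inequality comparison relies on.
    """
    with Image.open(image_path) as img:
        width, height = img.size
        if max(width, height) <= MAX_DIMENSION:
            return image_path
        scale = MAX_DIMENSION / max(width, height)
        resized = img.resize((int(width * scale), int(height * scale)))
    os.makedirs(temp_dir, exist_ok=True)
    stem = os.path.splitext(os.path.basename(image_path))[0]
    out_path = os.path.join(temp_dir, f"opt_{stem}.png")
    resized.save(out_path, format="PNG")
    return out_path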
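
Both test.py (_split_sentences) and the deleted text_splitter.py lean on the same trick: re.split over a capturing group returns the terminal punctuation as separate list items, so each sentence can be re-joined with its own terminator, and a trailing fragment with no terminator gets a default full stop. A minimal worked example of that round-trip:

import re

pattern = re.compile(r'([。!?…]+)')
parts = pattern.split("第一句。第二句!还有尾巴")
# -> ['第一句', '。', '第二句', '!', '还有尾巴']

sentences = [parts[i] + parts[i + 1]
             for i in range(0, len(parts) - 1, 2) if parts[i].strip()]
if len(parts) % 2 == 1 and parts[-1].strip():
    sentences.append(parts[-1] + '。')  # same default terminator as test.py

assert sentences == ['第一句。', '第二句!', '还有尾巴。']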