Refactor logic and optimize code
This commit is contained in:
parent
86417f4e69
commit
7ebf20ed6c
@@ -129,11 +129,18 @@ class BatchProcessor:
            # Generate the DOCX
            def update_file_progress(progress: int, text: str):
                if progress_callback:
+                    # Refine the progress calculation to avoid floating-point precision issues
+                    # Keep the current file's progress within the 0-100 range
+                    file_progress = max(0, min(100, progress))
+
-                    # Overall progress: this file's share of the total
-                    file_weight = 1.0 / total_count
-                    current_file_progress = current_index + (progress / 100.0)
-                    overall_progress = int((current_file_progress / total_count) * 100)
-                    progress_callback(overall_progress, f"{pair['txt']['name']}: {text}")
+                    if total_count > 0:
+                        # Use integer arithmetic to avoid floating-point precision issues
+                        overall_progress = (current_index * 100 + file_progress) // total_count
+                        overall_progress = max(0, min(100, overall_progress))
+                        progress_callback(int(overall_progress), f"{pair['txt']['name']}: {text}")
+                    else:
+                        progress_callback(file_progress, f"{pair['txt']['name']}: {text}")

            success = self.docx_generator.generate(sections, image_files, output_path, update_file_progress)

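Note on the hunk above: the integer formula stays exact and clamped where the old float version could drift at the boundaries. A minimal sketch to sanity-check it (hypothetical helper, same variable names as the hunk):

def overall(current_index: int, file_progress: int, total_count: int) -> int:
    # Integer arithmetic only: no rounding drift, result always lands in [0, 100]
    return max(0, min(100, (current_index * 100 + file_progress) // total_count))

assert overall(0, 0, 3) == 0      # nothing processed yet
assert overall(1, 50, 3) == 50    # halfway through file 2 of 3
assert overall(2, 100, 3) == 100  # last file finished -> exactly 100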
@@ -148,7 +155,7 @@ class BatchProcessor:
        return result

    def validate_batch_input(self, txt_folder: str, images_root: str,
-                             output_root: str = None) -> Dict[str, Any]:
+                             output_root: Optional[str] = None) -> Dict[str, Any]:
        """
        Validate the input parameters for batch processing

15 config.json
@@ -1,15 +0,0 @@
{
    "text_order_conversion": false,
    "typo_handling": false,
    "punctuation_replacement": false,
    "paragraph_formatting": false,
    "paragraph_min_length": 100,
    "paragraph_max_length": 300,
    "typo_intensity": 0.5,
    "custom_punctuation": ",。!?;?!;",
    "output_path": "",
    "use_same_folder": true,
    "last_txt_folder": "",
    "last_images_root": "",
    "last_output_root": ""
}
@@ -226,49 +226,19 @@ class DocxGenerator:
                para = doc.add_paragraph(style='List Bullet')
                self._apply_inline_formatting(para, content)
-                # Apply list styling
-                if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.unordered_list:
-                    list_style = self.current_document_style.unordered_list
-                    if list_style.paragraph:
-                        if list_style.paragraph.space_before > 0:
-                            para.paragraph_format.space_before = Pt(list_style.paragraph.space_before)
-                        if list_style.paragraph.space_after > 0:
-                            para.paragraph_format.space_after = Pt(list_style.paragraph.space_after)
+                self._apply_list_style(para, 'unordered')

            elif element_type == 'ordered_list':
                para = doc.add_paragraph(style='List Number')
                self._apply_inline_formatting(para, content)
-                # Apply list styling
-                if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.ordered_list:
-                    list_style = self.current_document_style.ordered_list
-                    if list_style.paragraph:
-                        if list_style.paragraph.space_before > 0:
-                            para.paragraph_format.space_before = Pt(list_style.paragraph.space_before)
-                        if list_style.paragraph.space_after > 0:
-                            para.paragraph_format.space_after = Pt(list_style.paragraph.space_after)
+                self._apply_list_style(para, 'ordered')

            elif element_type == 'blockquote':
                para = doc.add_paragraph(style='Quote')
                self._apply_inline_formatting(para, content)
-                # Apply quote styling
-                if hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.quote_block:
-                    quote_style = self.current_document_style.quote_block
-                    if quote_style.paragraph:
-                        if quote_style.paragraph.line_spacing > 0:
-                            para.paragraph_format.line_spacing = quote_style.paragraph.line_spacing
-                        if quote_style.paragraph.space_before > 0:
-                            para.paragraph_format.space_before = Pt(quote_style.paragraph.space_before)
-                        if quote_style.paragraph.space_after > 0:
-                            para.paragraph_format.space_after = Pt(quote_style.paragraph.space_after)
-                        if quote_style.paragraph.first_line_indent > 0:
-                            para.paragraph_format.first_line_indent = Pt(quote_style.paragraph.first_line_indent * 12)
-
-                        # Set the alignment
-                        if quote_style.paragraph.alignment == "center":
-                            para.alignment = WD_ALIGN_PARAGRAPH.CENTER
-                        elif quote_style.paragraph.alignment == "right":
-                            para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
-                        elif quote_style.paragraph.alignment == "justify":
-                            para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
+                self._apply_quote_style(para)

            elif element_type == 'code_block':
                self._add_code_block(doc, element.get('content', ''), element.get('language', ''))
@@ -282,6 +252,58 @@ class DocxGenerator:
            elif element_type == 'empty':
                doc.add_paragraph()

+    def _apply_list_style(self, paragraph, list_type: str) -> None:
+        """
+        Apply list styling to a paragraph
+
+        Args:
+            paragraph: the DOCX paragraph object
+            list_type: the list type ('unordered' or 'ordered')
+        """
+        if not (hasattr(self, 'current_document_style') and self.current_document_style):
+            return
+
+        list_style = None
+        if list_type == 'unordered' and self.current_document_style.unordered_list:
+            list_style = self.current_document_style.unordered_list
+        elif list_type == 'ordered' and self.current_document_style.ordered_list:
+            list_style = self.current_document_style.ordered_list
+
+        if list_style and list_style.paragraph:
+            if list_style.paragraph.space_before > 0:
+                paragraph.paragraph_format.space_before = Pt(list_style.paragraph.space_before)
+            if list_style.paragraph.space_after > 0:
+                paragraph.paragraph_format.space_after = Pt(list_style.paragraph.space_after)
+
+    def _apply_quote_style(self, paragraph) -> None:
+        """
+        Apply blockquote styling to a paragraph
+
+        Args:
+            paragraph: the DOCX paragraph object
+        """
+        if not (hasattr(self, 'current_document_style') and self.current_document_style and self.current_document_style.quote_block):
+            return
+
+        quote_style = self.current_document_style.quote_block
+        if quote_style.paragraph:
+            if quote_style.paragraph.line_spacing > 0:
+                paragraph.paragraph_format.line_spacing = quote_style.paragraph.line_spacing
+            if quote_style.paragraph.space_before > 0:
+                paragraph.paragraph_format.space_before = Pt(quote_style.paragraph.space_before)
+            if quote_style.paragraph.space_after > 0:
+                paragraph.paragraph_format.space_after = Pt(quote_style.paragraph.space_after)
+            if quote_style.paragraph.first_line_indent > 0:
+                paragraph.paragraph_format.first_line_indent = Pt(quote_style.paragraph.first_line_indent * 12)
+
+            # Set the alignment
+            if quote_style.paragraph.alignment == "center":
+                paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
+            elif quote_style.paragraph.alignment == "right":
+                paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
+            elif quote_style.paragraph.alignment == "justify":
+                paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY

    def _add_formatted_paragraph(self, doc: DocxDocument, content: str) -> None:
        """
        Add a formatted paragraph
@@ -551,20 +573,23 @@ class DocxGenerator:
            output_path: the output file path (used for temporary files)
        """
        try:
-            # Process the image
-            img, width = ImageProcessor.process_image(image_path)
+            # Run the image through the optimization helper first
+            temp_dir = os.path.dirname(output_path)
+            optimized_image_path = ImageProcessor.optimize_image_for_docx(image_path, temp_dir)
+
+            # Process the image (orientation fix and resizing)
+            img, width = ImageProcessor.process_image(optimized_image_path)

            temp_img_path = None
            if config.image_resize == "width":
                # The resized image must be saved as a temporary file
-                temp_dir = os.path.dirname(output_path)
                os.makedirs(temp_dir, exist_ok=True)
                temp_img_path = os.path.join(temp_dir, f"temp_img_{hash(image_path)}.png")
                img.save(temp_img_path)
                self.temp_files.append(temp_img_path)
                img_path = temp_img_path
            else:
-                img_path = image_path
+                img_path = optimized_image_path if optimized_image_path != image_path else image_path

            # Create a paragraph and insert the image
            para = doc.add_paragraph()

@@ -6,7 +6,7 @@

import os
import glob
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Optional
from config import config

@@ -67,21 +67,37 @@ class FileHandler:
        if not os.path.isdir(images_root):
            raise Exception(f"图片根文件夹不存在: {images_root}")

-        # Collect all image folders
-        all_image_folders = []
+        matched_pairs = []
+
+        # Optimization: match during the walk itself instead of collecting every folder first
+        for txt in txt_files:
+            matches = []
+            txt_name = txt["name"].lower()
+
+            # Walk every subdirectory under the images root and try to match
            for root, dirs, _ in os.walk(images_root):
                for dir_name in dirs:
                    folder_path = os.path.join(root, dir_name)
-                    all_image_folders.append({
+                    folder_name = dir_name.lower()
+
+                    if config.match_pattern == "exact" and txt_name == folder_name:
+                        matches.append({
                            "path": folder_path,
                            "name": dir_name,
                            "relative_path": os.path.relpath(folder_path, images_root)
                        })
+                    elif config.match_pattern == "prefix" and folder_name.startswith(txt_name):
+                        matches.append({
+                            "path": folder_path,
+                            "name": dir_name,
+                            "relative_path": os.path.relpath(folder_path, images_root)
+                        })
+                    elif config.match_pattern == "contains" and txt_name in folder_name:
+                        matches.append({
+                            "path": folder_path,
+                            "name": dir_name,
+                            "relative_path": os.path.relpath(folder_path, images_root)
+                        })

-        matched_pairs = []
-
-        for txt in txt_files:
-            matches = FileHandler._find_matches_for_txt(txt, all_image_folders)

            if matches:
                # Pick the match with the shortest path
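The three matching modes in the rewritten loop reduce to plain string predicates; a minimal sketch of just that decision (hypothetical helper, names assumed):

def folder_matches(txt_name: str, folder_name: str, pattern: str) -> bool:
    # Both names are expected to be lower-cased already, as in the loop above
    if pattern == "exact":
        return txt_name == folder_name
    if pattern == "prefix":
        return folder_name.startswith(txt_name)
    if pattern == "contains":
        return txt_name in folder_name
    return False

assert folder_matches("chapter1", "chapter1", "exact")
assert folder_matches("chapter1", "chapter1_images", "prefix")
assert folder_matches("chapter1", "my_chapter1_pics", "contains")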
@@ -100,33 +116,6 @@ class FileHandler:

        return matched_pairs

-    @staticmethod
-    def _find_matches_for_txt(txt_info: Dict[str, str], image_folders: List[Dict[str, str]]) -> List[Dict[str, str]]:
-        """
-        Find the matching image folders for a single TXT file
-
-        Args:
-            txt_info: the TXT file record
-            image_folders: the list of all image folder records
-
-        Returns:
-            List[Dict[str, str]]: the matching image folders
-        """
-        matches = []
-        txt_name = txt_info["name"].lower()
-
-        for img_folder in image_folders:
-            folder_name = img_folder["name"].lower()
-
-            if config.match_pattern == "exact" and txt_name == folder_name:
-                matches.append(img_folder)
-            elif config.match_pattern == "prefix" and folder_name.startswith(txt_name):
-                matches.append(img_folder)
-            elif config.match_pattern == "contains" and txt_name in folder_name:
-                matches.append(img_folder)
-
-        return matches

    @staticmethod
    def get_image_files(folder_path: str) -> List[str]:
        """
@@ -142,17 +131,16 @@ class FileHandler:
            return []

        image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.gif', '*.webp', '*.tiff']
-        image_files = []
+        image_files = set()  # use a set to deduplicate

        for ext in image_extensions:
+            # Optimization: one glob pass per pattern, covering both letter cases
            pattern = os.path.join(folder_path, ext)
-            image_files.extend(glob.glob(pattern))
-            # Also check uppercase extensions
+            image_files.update(glob.glob(pattern))
            pattern_upper = os.path.join(folder_path, ext.upper())
-            image_files.extend(glob.glob(pattern_upper))
+            image_files.update(glob.glob(pattern_upper))

-        # Deduplicate (case-variant extensions can produce repeats)
-        image_files = list(set(image_files))
+        image_files = list(image_files)  # convert back to a list

        # Sort according to the config
        if config.image_sort_by == "name":
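Why the set matters here: on case-insensitive filesystems (Windows, default macOS), "*.jpg" and "*.JPG" can return the same file twice, and a set absorbs the duplicates as they arrive instead of deduplicating afterwards. A minimal standalone sketch of the same idea:

import glob
import os

def collect_images(folder: str, extensions: list) -> list:
    found = set()
    for ext in extensions:
        found.update(glob.glob(os.path.join(folder, ext)))
        found.update(glob.glob(os.path.join(folder, ext.upper())))
    return sorted(found)  # deterministic order before any config-based sorting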
@@ -234,7 +222,7 @@ class FileHandler:
        return output_path

    @staticmethod
-    def validate_paths(txt_folder: str, images_root: str, output_root: str = None) -> Dict[str, bool]:
+    def validate_paths(txt_folder: str, images_root: str, output_root: Optional[str] = None) -> Dict[str, bool]:
        """
        Validate that the given paths are usable

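The recurring signature change in this commit (str = None becoming Optional[str] = None) does not alter runtime behavior; it makes the implicit None default explicit so static type checkers accept it. A minimal illustration:

from typing import Optional

def validate(path: Optional[str] = None) -> bool:
    # A plain `path: str = None` default is rejected by strict checkers such as mypy
    return path is not None and path != ""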
@@ -9,12 +9,9 @@ from tkinter import ttk, filedialog, messagebox, simpledialog
from typing import Optional

from style_manager import style_manager, DocumentStyle
-from config import Config
+from config import config as config_manager
from advanced_style_editor import open_advanced_editor

-# Create a config-manager instance
-config_manager = Config()
-

def create_style_tab(parent):
    """Create the style-management tab"""
@@ -41,7 +38,8 @@ def create_style_tab(parent):

    def on_style_change(*args):
        config_manager.current_style = style_var.get()
-        config_manager.save_to_file('config.json')
+        from config import CONFIG_FILE_PATH
+        config_manager.save_to_file(CONFIG_FILE_PATH)
        _update_style_info()

    style_var.trace('w', on_style_change)

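Centralizing the save path in CONFIG_FILE_PATH avoids writing 'config.json' relative to whatever the current working directory happens to be. A plausible definition on the config side (an assumption; the actual constant lives in config.py):

import os

# Resolve the config file next to the module, independent of the CWD
CONFIG_FILE_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.json')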
@@ -394,6 +394,10 @@ def replace_text(text):
        run_tests()
        sys.exit(0)

+    # The actual text-processing logic
+    processor = TextProcessor(min_length=30)
+    return processor.process_text(text)
+
if __name__ == "__main__":
    # Command-line mode
    if len(sys.argv) > 1:
@@ -174,6 +174,33 @@ class StyleManager:
        styles = {}

+        # 1. Viral-article style - modeled on platforms like Zhihu and Toutiao
+        styles["爆款文章风格"] = self._create_viral_style()
+
+        # 2. WeChat Official Account style - professional new-media layout
+        styles["微信公众号风格"] = self._create_wechat_style()
+
+        # 3. Zhihu top-answer style - clear logic, well-structured
+        styles["知乎高赞回答风格"] = self._create_zhihu_style()
+
+        # 4. Xiaohongshu note style - fresh, artsy, girly
+        styles["小红书笔记风格"] = self._create_xiaohongshu_style()
+
+        # 5. Toutiao news style - information-dense, tight pacing
+        styles["今日头条新闻风格"] = self._create_toutiao_style()
+
+        # 6. Bilibili creator video-script style - relaxed, lively, youth-oriented
+        styles["B站UP主视频脚本风格"] = self._create_bilibili_style()
+
+        # 7. Enterprise WeChat group-notice style - formal and serious
+        styles["企业微信群通知风格"] = self._create_enterprise_style()
+
+        # 8. Emotional chicken-soup style - warm and healing
+        styles["情感鸡汤文风格"] = self._create_emotional_style()
+
+        return styles
+
+    def _create_viral_style(self) -> DocumentStyle:
+        """Create the viral-article style"""
        viral_style = DocumentStyle(
            name="爆款文章风格",
            description="高阅读量爆款文章风格,层次分明,吸引眼球",
@@ -223,9 +250,10 @@ class StyleManager:
            background_color="#F8F9FA"
        )

-        styles["爆款文章风格"] = viral_style
+        return viral_style

-        # 2. WeChat Official Account style - professional new-media layout
+    def _create_wechat_style(self) -> DocumentStyle:
+        """Create the WeChat Official Account style"""
        wechat_style = DocumentStyle(
            name="微信公众号风格",
            description="专业的微信公众号排版,阅读体验佳",
@@ -267,9 +295,10 @@ class StyleManager:
            border=True
        )

-        styles["微信公众号风格"] = wechat_style
+        return wechat_style

-        # 3. Zhihu top-answer style - clear logic, well-structured
+    def _create_zhihu_style(self) -> DocumentStyle:
+        """Create the Zhihu top-answer style"""
        zhihu_style = DocumentStyle(
            name="知乎高赞回答风格",
            description="逻辑清晰,层次分明,专业权威",
@@ -300,9 +329,10 @@ class StyleManager:
            line_spacing=1.3, space_before=10, space_after=8
        )

-        styles["知乎高赞回答风格"] = zhihu_style
+        return zhihu_style

-        # 4. Xiaohongshu note style - fresh, artsy, girly
+    def _create_xiaohongshu_style(self) -> DocumentStyle:
+        """Create the Xiaohongshu note style"""
        xiaohongshu_style = DocumentStyle(
            name="小红书笔记风格",
            description="清新文艺,适合生活方式类内容",
@@ -332,9 +362,10 @@ class StyleManager:
            line_spacing=1.3, space_before=8, space_after=6
        )

-        styles["小红书笔记风格"] = xiaohongshu_style
+        return xiaohongshu_style

-        # 5. Toutiao news style - information-dense, tight pacing
+    def _create_toutiao_style(self) -> DocumentStyle:
+        """Create the Toutiao news style"""
        toutiao_style = DocumentStyle(
            name="今日头条新闻风格",
            description="信息密度高,节奏紧凑,突出重点",
@@ -364,9 +395,10 @@ class StyleManager:
            line_spacing=1.3, space_before=8, space_after=6
        )

-        styles["今日头条新闻风格"] = toutiao_style
+        return toutiao_style

-        # 6. Bilibili creator video-script style - relaxed, lively, youth-oriented
+    def _create_bilibili_style(self) -> DocumentStyle:
+        """Create the Bilibili creator video-script style"""
        bilibili_style = DocumentStyle(
            name="B站UP主视频脚本风格",
            description="轻松活泼,适合年轻受众,有趣有料",
@@ -396,9 +428,10 @@ class StyleManager:
            line_spacing=1.3, space_before=8, space_after=6
        )

-        styles["B站UP主视频脚本风格"] = bilibili_style
+        return bilibili_style

-        # 7. Enterprise WeChat group-notice style - formal and serious
+    def _create_enterprise_style(self) -> DocumentStyle:
+        """Create the Enterprise WeChat group-notice style"""
        enterprise_style = DocumentStyle(
            name="企业微信群通知风格",
            description="正式严肃,信息传达清晰,商务风格",
@@ -423,9 +456,10 @@ class StyleManager:
            line_spacing=1.3, space_before=12, space_after=8
        )

-        styles["企业微信群通知风格"] = enterprise_style
+        return enterprise_style

-        # 8. Emotional chicken-soup style - warm and healing
+    def _create_emotional_style(self) -> DocumentStyle:
+        """Create the emotional chicken-soup style"""
        emotional_style = DocumentStyle(
            name="情感鸡汤文风格",
            description="温暖治愈,情感丰富,适合心灵鸡汤类内容",
@@ -460,9 +494,7 @@ class StyleManager:
            background_color="#FFF3E0"
        )

-        styles["情感鸡汤文风格"] = emotional_style
-
-        return styles
+        return emotional_style

    def _load_custom_styles(self) -> None:
        """Load custom styles"""

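The StyleManager change is a factory-method refactor: one monolithic _load_default_styles becomes a registry that delegates to per-style _create_*() builders, so each style is constructed and returned independently. A minimal sketch of the shape (hypothetical names, not the project's API):

class StyleRegistry:
    def load_defaults(self) -> dict:
        # Each builder owns exactly one style and returns it
        return {
            "style_a": self._create_style_a(),
            "style_b": self._create_style_b(),
        }

    def _create_style_a(self):
        return {"name": "style_a"}

    def _create_style_b(self):
        return {"name": "style_b"}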
438 test.py Normal file
@@ -0,0 +1,438 @@
import re
import random
from typing import List, Dict, Tuple, Optional
import jieba
import jieba.posseg as pseg


class EnhancedArticleRewriter:
    """
    Enhanced intelligent article rewriter
    Produces more natural sentence variation that follows human writing habits
    """

    def __init__(self):
        # Punctuation definitions
        self.sentence_endings = ['。', '!', '?', '…']
        self.pause_marks = [',', ';', ':', '、']

        # Sentence-length distribution (mimics human writing habits)
        self.sentence_length_distribution = {
            'short': (5, 15),        # short sentences
            'medium': (16, 30),      # medium sentences
            'long': (31, 50),        # long sentences
            'extra_long': (51, 80)   # extra-long sentences
        }

        # Connector lexicon (a richer set of connectives)
        self.connectors = {
            'sequence': ['随后', '接着', '然后', '紧接着', '继而', '进而'],
            'addition': ['并且', '同时', '此外', '另外', '再者', '况且', '而且'],
            'contrast': ['但是', '然而', '不过', '可是', '却', '反而', '相反'],
            'cause': ['因为', '由于', '因此', '所以', '故而', '从而'],
            'condition': ['如果', '假如', '倘若', '若是', '要是'],
            'concession': ['虽然', '尽管', '即使', '纵然', '固然'],
            'summary': ['总之', '综上', '总的来说', '概括地说', '简言之'],
            'example': ['比如', '例如', '譬如', '好比', '正如'],
            'emphasis': ['特别是', '尤其是', '更重要的是', '值得注意的是'],
            'explanation': ['也就是说', '换句话说', '具体来说', '准确地说']
        }

        # Sentence-pattern templates
        self.sentence_patterns = {
            'statement': ['{}'],  # declarative
            'emphasis_front': ['值得注意的是,{}', '需要强调的是,{}', '重要的是,{}'],
            'emphasis_back': ['{},这一点尤为重要', '{},这是关键所在'],
            'question_rhetorical': ['难道不是{}吗?', '{},不是吗?'],
            'parallel': ['不仅{},而且{}', '既{},又{}', '一方面{},另一方面{}'],
            'progressive': ['先是{},然后{}', '从{}到{}', '由{}发展到{}']
        }

        # Synonym / near-synonym replacement lexicon
        self.synonyms = {
            '发展': ['演进', '进步', '演变', '发展', '进化', '提升', '推进'],
            '改变': ['变化', '转变', '改变', '变革', '转换', '调整', '革新'],
            '重要': ['关键', '重要', '核心', '主要', '根本', '要紧', '关键性'],
            '影响': ['作用', '影响', '效应', '冲击', '波及', '涉及'],
            '提高': ['提升', '增强', '改善', '优化', '加强', '增进'],
            '显示': ['表明', '显示', '说明', '揭示', '体现', '反映', '展现'],
            '通过': ['利用', '运用', '借助', '凭借', '依靠', '经由'],
            '实现': ['达成', '实现', '完成', '达到', '做到', '落实'],
            '问题': ['难题', '问题', '挑战', '困难', '障碍', '瓶颈'],
            '方法': ['方式', '手段', '途径', '办法', '策略', '措施'],
            '需要': ['需要', '要求', '必须', '应该', '亟需', '急需'],
            '能够': ['能够', '可以', '能', '可', '得以', '足以'],
            '非常': ['十分', '相当', '特别', '格外', '极其', '异常', '颇为'],
            '很多': ['许多', '大量', '众多', '诸多', '不少', '大批'],
            '所有': ['全部', '一切', '所有', '整个', '全体', '各个'],
            '已经': ['已', '已经', '业已', '早已', '都已'],
            '正在': ['正', '正在', '在', '正处于', '目前正'],
            '越来越': ['日益', '愈发', '愈加', '更加', '日渐', '渐渐'],
            '不断': ['持续', '不断', '连续', '陆续', '继续', '频繁'],
            '各种': ['各类', '各种', '多种', '种种', '诸般', '多样'],
        }
    def _get_random_sentence_length_type(self) -> str:
        """Randomly pick a sentence-length type from a weighted distribution"""
        # Mimic human sentence-length habits: medium sentences dominate,
        # short and long come next, extra-long are rarest
        weights = {'short': 25, 'medium': 40, 'long': 25, 'extra_long': 10}
        types = list(weights.keys())
        probs = [weights[t] / 100 for t in types]
        return random.choices(types, weights=probs)[0]

    def _smart_split_merge_sentences(self, sentences: List[str]) -> List[str]:
        """Intelligently split and merge sentences to create a natural long/short rhythm"""
        if not sentences:
            return sentences

        result = []
        i = 0

        while i < len(sentences):
            # Pick a target sentence-length type
            target_type = self._get_random_sentence_length_type()
            min_len, max_len = self.sentence_length_distribution[target_type]

            current_sentence = sentences[i].strip()
            current_len = len(current_sentence)

            # If the current sentence is too long, try to split it
            if current_len > max_len:
                split_sentences = self._split_sentence_naturally(current_sentence, max_len)
                result.extend(split_sentences)

            # If it is too short, try merging it with the next sentence
            elif current_len < min_len and i + 1 < len(sentences):
                # Merge short sentences with 30% probability
                if random.random() < 0.3:
                    merged = self._merge_sentences(current_sentence, sentences[i + 1])
                    result.append(merged)
                    i += 1  # skip the next sentence
                else:
                    result.append(current_sentence)

            # Length is fine: keep it as-is
            else:
                result.append(current_sentence)

            i += 1

        return result
    def _split_sentence_naturally(self, sentence: str, max_length: int) -> List[str]:
        """Split a long sentence at natural points"""
        if len(sentence) <= max_length:
            return [sentence]

        # Preserve the sentence-ending punctuation
        ending = ''
        for mark in self.sentence_endings:
            if sentence.endswith(mark):
                ending = mark
                sentence = sentence[:-len(mark)]
                break

        # Prefer splitting at commas
        parts = []
        if ',' in sentence:
            segments = sentence.split(',')
            current = ""

            for i, segment in enumerate(segments):
                if not current:
                    current = segment
                elif len(current + ',' + segment) <= max_length:
                    current += ',' + segment
                else:
                    # Add an ending mark so the fragment becomes a complete sentence
                    if random.random() < 0.7:  # 70% chance of a period
                        parts.append(current + '。')
                    else:  # 30% chance of another ending
                        parts.append(current + random.choice(['。', '!', '']))
                    current = segment

            # Handle the final part
            if current:
                parts.append(current + ending)
        else:
            # No comma to split on: keep the sentence intact
            parts = [sentence + ending]

        return parts if parts else [sentence + ending]

    def _merge_sentences(self, sent1: str, sent2: str) -> str:
        """Merge two sentences intelligently"""
        # Strip the ending punctuation from the first sentence
        for mark in self.sentence_endings:
            if sent1.endswith(mark):
                sent1 = sent1[:-len(mark)]
                break

        # Choose how to join them
        merge_type = random.choice(['comma', 'connector', 'semicolon'])

        if merge_type == 'comma':
            return sent1 + ',' + sent2
        elif merge_type == 'connector':
            # Randomly pick a connector category
            conn_type = random.choice(list(self.connectors.keys()))
            connector = random.choice(self.connectors[conn_type])
            return sent1 + ',' + connector + sent2
        else:  # semicolon
            return sent1 + ';' + sent2
    def _replace_synonyms(self, text: str, intensity: float) -> str:
        """Synonym replacement"""
        words = list(jieba.cut(text))
        result = []

        for word in words:
            if word in self.synonyms and random.random() < intensity:
                # Pick a synonym (avoiding the original word)
                alternatives = [w for w in self.synonyms[word] if w != word]
                if alternatives:
                    result.append(random.choice(alternatives))
                else:
                    result.append(word)
            else:
                result.append(word)

        return ''.join(result)

    def _adjust_sentence_structure(self, sentence: str, intensity: float) -> str:
        """Adjust sentence structure so it reads more naturally"""
        if random.random() > intensity:
            return sentence

        # Preserve the sentence-ending punctuation
        ending = ''
        for mark in self.sentence_endings:
            if sentence.endswith(mark):
                ending = mark
                sentence = sentence[:-len(mark)]
                break

        # Randomly choose an adjustment type
        adjust_type = random.choice(['reorder', 'add_emphasis', 'change_pattern'])

        if adjust_type == 'reorder' and ',' in sentence:
            # Reorder the clauses
            parts = sentence.split(',')
            if len(parts) >= 2:
                # Smart reordering: not fully random, but a logical shuffle
                if len(parts) == 2:
                    # Two clauses: swap them directly
                    sentence = parts[1] + ',' + parts[0]
                else:
                    # Several clauses: move the middle one forward or back
                    mid_idx = len(parts) // 2
                    if random.random() < 0.5:
                        # middle clause to the front
                        parts = [parts[mid_idx]] + parts[:mid_idx] + parts[mid_idx + 1:]
                    else:
                        # middle clause to the back
                        parts = parts[:mid_idx] + parts[mid_idx + 1:] + [parts[mid_idx]]
                    sentence = ','.join(parts)

        elif adjust_type == 'add_emphasis':
            # Add emphasis
            if random.random() < 0.3:
                pattern = random.choice(self.sentence_patterns['emphasis_front'])
                sentence = pattern.format(sentence)
            elif random.random() < 0.3:
                pattern = random.choice(self.sentence_patterns['emphasis_back'])
                sentence = pattern.format(sentence)

        elif adjust_type == 'change_pattern':
            # Change the sentence pattern
            if ',' in sentence and random.random() < 0.4:
                parts = sentence.split(',', 1)
                if len(parts) == 2:
                    # Use a parallel or progressive pattern
                    if random.random() < 0.5:
                        sentence = f"不仅{parts[0]},而且{parts[1]}"
                    else:
                        sentence = f"{parts[0]},进而{parts[1]}"

        return sentence + ending
    def _add_natural_variations(self, sentence: str, intensity: float) -> str:
        """Add natural language variation"""
        if random.random() > intensity:
            return sentence

        variations = []

        # 20% chance to add a transition word
        if random.random() < 0.2:
            transition = random.choice(['其实', '事实上', '实际上', '确实', '显然'])
            variations.append(f"{transition},{sentence}")

        # 15% chance to add a degree adverb
        if random.random() < 0.15:
            adverb = random.choice(['更', '更加', '尤其', '特别', '格外'])
            # Naively insert the adverb before verbs such as 是, 有, 能
            for verb in ['是', '有', '能', '会', '要']:
                if verb in sentence:
                    sentence = sentence.replace(verb, f"{adverb}{verb}", 1)
                    break

        return variations[0] if variations else sentence
    def rewrite(self, text: str, config: Optional[Dict] = None) -> str:
        """
        Main entry point: rewrite an article

        Args:
            text: the input article
            config: configuration dict
                - intensity: rewrite strength, 0.0-1.0
                - preserve_meaning: whether to preserve the original meaning
                - natural_flow: whether to keep the text flowing naturally
                - vary_sentence_length: whether to vary sentence lengths
        """
        if config is None:
            config = {}

        # Set defaults
        config.setdefault('intensity', 0.6)
        config.setdefault('preserve_meaning', True)
        config.setdefault('natural_flow', True)
        config.setdefault('vary_sentence_length', True)

        intensity = config['intensity']

        # Process paragraph by paragraph
        paragraphs = [p.strip() for p in text.split('\n') if p.strip()]
        result_paragraphs = []

        for para in paragraphs:
            # Split into sentences
            sentences = self._split_sentences(para)

            # 1. Adjust sentence lengths first
            if config['vary_sentence_length']:
                sentences = self._smart_split_merge_sentences(sentences)

            # 2. Process each sentence
            processed_sentences = []
            for i, sent in enumerate(sentences):
                # Synonym replacement
                sent = self._replace_synonyms(sent, intensity * 0.5)

                # Sentence-structure adjustment
                sent = self._adjust_sentence_structure(sent, intensity * 0.7)

                # Natural variation
                sent = self._add_natural_variations(sent, intensity * 0.3)

                processed_sentences.append(sent)

            # 3. Paragraph reshaping (occasionally reorder sentences)
            if len(processed_sentences) > 3 and random.random() < intensity * 0.2:
                # ~20% chance to nudge the order (swap adjacent sentences only)
                idx = random.randint(0, len(processed_sentences) - 2)
                processed_sentences[idx], processed_sentences[idx + 1] = \
                    processed_sentences[idx + 1], processed_sentences[idx]

            result_paragraphs.append(''.join(processed_sentences))

        return '\n\n'.join(result_paragraphs)

    def _split_sentences(self, text: str) -> List[str]:
        """Improved sentence splitting"""
        # Handle several kinds of sentence-ending punctuation
        pattern = '([。!?…]+)'
        parts = re.split(pattern, text)

        sentences = []
        for i in range(0, len(parts) - 1, 2):
            if parts[i].strip():
                sentences.append(parts[i] + parts[i + 1])

        # Handle the trailing part
        if len(parts) % 2 == 1 and parts[-1].strip():
            sentences.append(parts[-1] + '。')  # append a default period

        return sentences
def demo():
    """Usage example"""
    sample_text = """
最近,晓蕾又上热搜了!

咋回事呢?原来,她和老公刘剑一起开了直播带货的副业。但特意声明:她早就离开了上海电视台的编制,也不拿电视台的工资。换句话说,现在卖东西,完全是私营业态。

这事儿一下子引爆了大家的八卦魂。毕竟,明星主持扎堆直播间,也不算新鲜事。但还是挺多人纳闷:这些当年的 "话筒头牌",是不是集体选择摆烂了?

其实,晓蕾和刘剑干脆落落大方,在直播间直接回应了这点。俩人意思很明确:“我们不是来拉低职业口碑的”。而且还耐心解释了自己转行的理由。
曾经的大佬,变成了烟火气

说到晓蕾,不了解点她背景都不好意思讨论人家。当年上视新闻部的 "当家花旦",光学历和气质,足够秒杀隔壁主持圈的八条街。而刘剑,早年可是 "台柱子",播音腔精致到令人耳膜怀孕。照理来说,这样一对,在编制铁饭碗里躺平一辈子没毛病。

可人家偏不。

晓蕾说过这样一句话:“其实,我就是个普通人。” 真的那么普通吗?她不这么说,没人敢忘了她的标杆履历啊!她离开台里后,居然一头扎进了童语言教育这个赛道,一干就是十年,让机构做到了业内小圈子的爆款水准。

而这次直播,打的商品也不混乱,主打性价比和实用属性,晓蕾每件商品还得亲测过。如果你觉得她自吹自擂,建议去看看她直播间的粉丝评论。大家的意思是:晓蕾推品 = 放心买。
刘剑这枚 “前一哥”,更狠!

说晓蕾牛,别忘了,刘剑十年前也上演了一场 “豪赌”。那个年代,辞去电视台稳定工作,和 “打水漂” 差不多。

可是刘剑敢把梭全下,为啥?因为他看中了播音考生和辅导课程的市场,那时还没有多少人扎堆干这块,他觉得这是个机会。

果然,就这么辞了职,工作的腰板从跟组织吃工资,摇身变成了名副其实的事业单位 —— 自己家老板。虽然后来也是磕磕绊绊,但终究从试验田里掘出了一片肥沃地。
主持人的 “下海”,是换方向走

有人觉得,曾经的新闻人、主持人 “跑去带货”,肯定是混不下去了。你要放在十年前,这种联想不稀奇,可现在不一样了。大环境变了,传统媒体是真的在互联网时代被打败得找不到调。

原来电视频道的观众,现在早转移到手机端,看知乎、刷短视频,甚至晚上蹲个带货直播会。你说新闻节目的高冷主播,现在换脸做带货主持,是不是 “落魄”?未必。

晓蕾夫妻这一波,实际上是转型很成功的范例。不管带啥网红货,他们俩把品质第一的逻辑摆明白了。这样的主播,不止卖产品,更卖信誉,靠着时间积攒了观众的信任。
直播间哪门子 LOW?明明是主战场

网友说得有趣:“谁嫌直播带货 LOW,谁就输定了。” 道理没跑儿,移动互联网成了咱生活重心,生意也跟着迁移。这是明显趋势,看不懂的还真不想赚钱了。

而且,做直播一点不轻松。站几个小时口播、随时照顾弹幕情绪,这比坐着念提词器辛苦多了。像晓蕾和刘剑这样的 “摸鱼资历”,能转过身来赚饭钱,这不是 “混”,是 “拼” 啊。

别说传统意义的职业崇拜消失殆尽,你觉得稳如狗的岗位,说散架就散。老一辈金饭碗情结,对于下一代新创别说香,而是种被淘汰跑赢速度内心创新积极点。

我不是电视台员工了,早就离职 10 年了。
"""

    rewriter = EnhancedArticleRewriter()

    print("=" * 60)
    print("原文:")
    print("=" * 60)
    print(sample_text)

    # Try several rewrite intensities
    for intensity in [0.3, 0.6, 0.9]:
        print(f"\n{'=' * 60}")
        print(f"改写强度: {intensity}")
        print("=" * 60)

        config = {
            'intensity': intensity,
            'preserve_meaning': True,
            'natural_flow': True,
            'vary_sentence_length': True
        }

        result = rewriter.rewrite(sample_text, config)
        print(result)

        # Sentence-length statistics
        sentences = re.split('[。!?…]+', result)
        lengths = [len(s) for s in sentences if s.strip()]
        if lengths:
            print(f"\n句子长度分布: 最短={min(lengths)}, 最长={max(lengths)}, 平均={sum(lengths) / len(lengths):.1f}")
            print(f"句子数量: {len(lengths)}")


if __name__ == '__main__':
    # Note: the jieba package must be installed
    # pip install jieba
    demo()
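test.py's rewriter is stochastic end to end (random.choices and random.random throughout), so two runs differ. Seeding the RNG first makes a run reproducible; a minimal sketch, assuming test.py is importable as a module:

import random
from test import EnhancedArticleRewriter  # assumption: test.py is on the import path

random.seed(42)  # fixed seed -> the same rewrite on every run
rewriter = EnhancedArticleRewriter()
print(rewriter.rewrite("这是一个测试句子。它会被随机改写。", {'intensity': 0.6}))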
174 test_1.py Normal file
@@ -0,0 +1,174 @@
import re
import jieba
import random
from typing import List


class HeavyHumanizer:
    """Heavy humanizing rewriter - keeps the logic, stays readable, meant to get past AI detection"""

    def __init__(self):
        jieba.initialize()
        # Human-style openers / transitions / asides
        self.openings = ['说到', '提到', '关于', '其实', '要说', '你知道吗', '顺便说']
        self.transitions = ['但是', '不过', '然而', '话说回来', '可惜的是', '偏偏']
        self.fillers = ['其实', '当然', '显然', '我觉得', '说起来', '顺便说']
        # Subjective expressions
        self.subjective = ['我认为', '我觉得', '在我看来', '就我所知', '据我了解']
        # Colloquial particles
        self.colloquial_particles = ['呢', '吧', '啊', '哦', '嘛', '哈', '呀']
        # High-risk word replacements
        self.high_risk = {
            '重要': ['关键', '核心', '主要'],
            '显著': ['明显', '突出', '很大'],
            '提升': ['提高', '增强', '改善'],
            '确保': ['保证', '做到', '维护'],
            '实施': ['执行', '开展', '推行']
        }
        # Sentence-ending punctuation
        self.sentence_endings = {'。', '!', '?', '.', '!', '?', '…', ';', ';'}
    def split_sentences(self, text: str) -> List[str]:
        """Split text at sentence-ending punctuation"""
        sentences = []
        current = ''
        for c in text:
            current += c
            if c in self.sentence_endings:
                sentences.append(current.strip())
                current = ''
        if current.strip():
            sentences.append(current.strip())
        return sentences

    def replace_high_risk_words(self, sentence: str) -> str:
        """Swap out high-risk AI-sounding words"""
        for k, v_list in self.high_risk.items():
            if k in sentence and random.random() < 0.8:
                sentence = sentence.replace(k, random.choice(v_list))
        return sentence

    def add_subjective_expressions(self, sentence: str) -> str:
        """Randomly add subjective expressions or colloquial particles"""
        if random.random() < 0.3:
            expr = random.choice(self.subjective)
            sentence = expr + ',' + sentence
        if random.random() < 0.2:
            particle = random.choice(self.colloquial_particles)
            if sentence.endswith('。'):
                sentence = sentence[:-1] + particle + '。'
        if random.random() < 0.15:
            filler = random.choice(self.fillers)
            sentence = filler + ',' + sentence
        return sentence

    def vary_sentence_length(self, sentences: List[str]) -> List[str]:
        """Shuffle sentence order or split long sentences to increase burstiness"""
        varied = []
        i = 0
        while i < len(sentences):
            s = sentences[i]
            # Split overly long sentences in moderation
            if len(s) > 50 and random.random() < 0.5:
                mid = len(s) // 2
                # Find the nearest comma before the midpoint (full- or half-width)
                comma_pos = max(s.rfind(',', 0, mid), s.rfind(',', 0, mid))
                if comma_pos > 5:
                    first = s[:comma_pos + 1].strip()
                    second = s[comma_pos + 1:].strip()
                    varied.extend([first, second])
                    i += 1
                    continue
            varied.append(s)
            i += 1
        # Shuffle the order while keeping logical blocks
        if random.random() < 0.3:
            random.shuffle(varied)
        return varied
    def create_paragraphs(self, sentences: List[str]) -> List[str]:
        """Form paragraphs from sentence lengths plus a random cut probability"""
        paragraphs = []
        current = []
        current_len = 0
        for s in sentences:
            current.append(s)
            current_len += len(s)
            if current_len > 80 and (random.random() < 0.4 or current_len > 150):
                paragraphs.append(''.join(current))
                current = []
                current_len = 0
        if current:
            paragraphs.append(''.join(current))
        return paragraphs

    def humanize_text(self, text: str) -> str:
        """Core humanizing rewrite"""
        # Clean up the text
        text = re.sub(r'\s+', '', text)
        sentences = self.split_sentences(text)

        # High-risk word replacement
        sentences = [self.replace_high_risk_words(s) for s in sentences]

        # Add subjective expressions, colloquialisms, asides
        sentences = [self.add_subjective_expressions(s) for s in sentences]

        # Increase long/short variation and burstiness
        sentences = self.vary_sentence_length(sentences)

        # Build paragraphs
        paragraphs = self.create_paragraphs(sentences)

        # Format the paragraphs
        formatted = '\n\n'.join([' ' + p for p in paragraphs])
        return formatted
# ================== Usage example ==================
if __name__ == "__main__":
    text = """
最近,晓蕾又上热搜了!

换句话说,现在卖东西,完全是私营业态。咋回事呢?但特意声明:她早就离开了上海电视台的编制,也不拿电视台的工资欸。原来,她和老公刘剑一起开了直播带货的副业。

这事儿一下子引爆了大家的八卦魂。毕竟,明星主持扎堆直播间,也不算新鲜事你说呢。

但还是挺多人纳闷:这些当年的 "话筒头牌",是不是集体选择摆烂了?

其实,晓蕾和刘剑干…俩人意思很明确:“我们不是来拉低职业口碑的”。而且还耐心解释了自己转行的理由,曾经的大佬,变成了烟火气。

说到晓蕾,不了解点她背景都不好意思讨论人家。当年上视新闻部的 "当家花旦",光学历和气质,足够秒杀隔壁主持圈的八条街。而刘剑,似乎早年可是 "台柱子",播音腔精致到令人耳膜怀孕嗯。照理来说,这样一对,在编制铁饭碗里躺平一辈子没毛病。

可人家偏不。

晓蕾说过这样一句话:“其实,我就是个普通人。” 真的那么普通吗?她不这么说,没人敢忘了她的标杆履历啊!她离开台里后,居然一头扎进了童语言教育这个赛道,一干就是十年,让机构做到了业内小圈子的爆款水准。

而这次直播,打的商品也不混乱,主打性价比和实用属性,晓蕾每件商品还得亲测过。如果你觉得她自吹自擂,建议去看看她直播间的粉丝评论。大家地意思是:晓蕾推品 = 放心买。

刘剑这枚 “前一哥”,更狠!

说晓蕾牛,看起来别忘了,刘剑十年前也上演了一场 “豪赌”。那个年代,辞去电视台稳定工作,和 “打水漂” 差不多。

可是刘剑敢把梭全下,为啥?因为他看中了播音考生和辅导课程的市场,那时还没有多少人扎堆干这块,他觉得这是个机会。

果然,就这么辞了职,工作的腰板从跟组织吃工资,摇身变成了名副其实的事业单位 —— 自己家老板。虽然后来也是磕磕绊绊,但终究从试验田里掘出了一片肥沃地。主持人的 “下海”,是换方向走。

有人觉得,曾经的新闻人、主持人 “跑去带货”,肯定是混不下去了。你要放在十年前,这种联想不稀奇,可现在不一样了。大环境变了,看起来传统媒体是真的在互联网时代被打败得找不到调。

原来电视频道的观众,现在早转移到手机端,看知乎、刷短视频,甚至晚上蹲个带货直播会。就像我说的,我认为,你说新闻节目的高冷主播,现在换脸做带货主持,是不是 “落魄”?未必。

其实,晓蕾夫妻这一波,实际上是转型很成功的范例。不管带啥网红货,可能他们俩把品质第一的逻辑摆明白了啊。这样的主播,不止卖产品,更卖信誉,靠着时间积攒了观众的信任嗯。也许,直播间哪门子 LOW?明明是主战场。

网友说得有趣:“谁嫌直播带货 LOW,谁就输定了。” 道理没跑儿,似乎移动互联网成了咱生活重心,生意也跟着迁移啊。

这是明显趋势,看不懂的还真不想赚钱了。

而且,似乎做直播一点不轻松。站几个小时口播、随时照顾弹幕情绪,这比坐着念提词器辛苦多了。其实,像晓蕾和刘剑这样的 “摸鱼资历”,能转过身来赚饭钱,这不是 “混”,是 “拼” 啊。

别说传统意义的职业崇拜消失殆尽,你觉得稳如狗的岗位,说散架就散你说呢。老一辈金饭碗情结,对于下一代新创别说香,而是种被淘汰跑赢速度内心创新积极点。

我不是电视台员工了,早就离职 10 年了。 """
    humanizer = HeavyHumanizer()
    result = humanizer.humanize_text(text)
    print(result)
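HeavyHumanizer.split_sentences walks the string character by character; near-equivalent behavior can come from a regex split that keeps the terminators attached (it groups runs of terminators, unlike the char-walk). A minimal sketch (hypothetical helper):

import re

def split_sentences_re(text: str) -> list:
    # Capture terminator runs so they stay glued to their sentence
    parts = re.split(r'([。!?.!?…;;]+)', text)
    sentences = [a + b for a, b in zip(parts[0::2], parts[1::2]) if (a + b).strip()]
    if len(parts) % 2 == 1 and parts[-1].strip():
        sentences.append(parts[-1].strip())
    return sentences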
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1 +0,0 @@
这是一个测试文本。它包含多个句子。每个句子都很短。但是我们需要测试分段排版功能。当文本长度超过最小段落长度时。应该被分割成多个段落。这样可以提高文档的可读性。让内容更加清晰易懂。
@@ -1,35 +0,0 @@
#!/usr/bin/env python3
"""Tests for the paragraph-splitting feature"""

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from text_splitter import TextSplitter

# Test text
test_text = """这是一个测试文本。它包含多个句子。每个句子都很短。但是我们需要测试分段排版功能。
当文本长度超过最小段落长度时。应该被分割成多个段落。这样可以提高文档的可读性。
让内容更加清晰易懂。"""

def test_text_splitting():
    print("=== 测试分段排版功能 ===")
    print(f"原始文本长度: {len(test_text)} 字符")
    print(f"原始文本: {test_text}")
    print()

    # Create the splitter
    splitter = TextSplitter(min_length=50, max_length=200)

    # Split
    paragraphs = splitter.split_text(test_text)

    print(f"分段结果 ({len(paragraphs)} 个段落):")
    for i, paragraph in enumerate(paragraphs, 1):
        print(f"段落 {i} ({len(paragraph)} 字符): {paragraph}")

    print()
    print("=== 测试完成 ===")

if __name__ == "__main__":
    test_text_splitting()
140 text_splitter.py
@@ -1,140 +0,0 @@
import re

class TextSplitter:
    def __init__(self, min_length=100, max_length=300):
        """
        Initialize the text splitter
        :param min_length: minimum target paragraph length
        :param max_length: maximum target paragraph length
        """
        self.min_length = min_length
        self.max_length = max_length
        # Regex matching the punctuation used as split points (Chinese and English marks)
        # These marks usually terminate a complete sentence
        self.sentence_ending_punct = re.compile(r'([。?!.!?])')
    def split_text(self, text):
        """
        Split text into paragraphs that satisfy the length bounds, using punctuation only
        :param text: the raw text to split
        :return: the list of resulting paragraphs
        """
        if not text:
            return []

        # Measure the original text length
        original_length = len(text)
        print(f"原始文本长度: {original_length} 字符")

        # If the text is already within the minimum length, return it as a single paragraph
        if original_length <= self.min_length:
            return [text.strip()]

        # Split into complete sentences (keeping the punctuation)
        parts = self.sentence_ending_punct.split(text)
        sentences = []

        # Recombine so each mark stays attached to the text before it
        for i in range(0, len(parts)-1, 2):
            sentence = (parts[i] + parts[i+1]).strip()
            if sentence:  # skip empty sentences
                sentences.append(sentence)

        # No punctuation found: treat the whole text as one paragraph
        if not sentences:
            return [text.strip()]

        # Merge sentences into paragraphs, keeping them inside the length bounds
        paragraphs = []
        current_paragraph = ""

        for sentence in sentences:
            # Tentatively append the current sentence
            temp = current_paragraph + (" " if current_paragraph else "") + sentence

            # Would appending exceed the maximum length?
            if len(temp) > self.max_length:
                # Flush the current paragraph first if it is non-empty
                if current_paragraph:
                    paragraphs.append(current_paragraph)
                    current_paragraph = sentence
                else:
                    # A single sentence longer than max_length is accepted as-is (never split a sentence)
                    paragraphs.append(sentence)
                    current_paragraph = ""
            else:
                current_paragraph = temp

        # Append the final paragraph
        if current_paragraph:
            paragraphs.append(current_paragraph)

        # Merge any paragraph shorter than min_length into the next one
        i = 0
        while i < len(paragraphs) - 1:
            if len(paragraphs[i]) < self.min_length:
                # Merge this paragraph with the next
                paragraphs[i] = paragraphs[i] + " " + paragraphs[i+1]
                del paragraphs[i+1]
            else:
                i += 1

        print(f"分割后段落数量: {len(paragraphs)}")
        return paragraphs
# Usage example
if __name__ == "__main__":
    # Sample text
    sample_text = """
最近,晓蕾又上热搜了!

咋回事呢?原来,她和老公刘剑一起开了直播带货的副业。但特意声明:她早就离开了上海电视台的编制,也不拿电视台的工资。换句话说,现在卖东西,完全是私营业态。

这事儿一下子引爆了大家的八卦魂。毕竟,明星主持扎堆直播间,也不算新鲜事。但还是挺多人纳闷:这些当年的 "话筒头牌",是不是集体选择摆烂了?

其实,晓蕾和刘剑干脆落落大方,在直播间直接回应了这点。俩人意思很明确:“我们不是来拉低职业口碑的”。而且还耐心解释了自己转行的理由。
曾经的大佬,变成了烟火气

说到晓蕾,不了解点她背景都不好意思讨论人家。当年上视新闻部的 "当家花旦",光学历和气质,足够秒杀隔壁主持圈的八条街。而刘剑,早年可是 "台柱子",播音腔精致到令人耳膜怀孕。照理来说,这样一对,在编制铁饭碗里躺平一辈子没毛病。

可人家偏不。

晓蕾说过这样一句话:“其实,我就是个普通人。” 真的那么普通吗?她不这么说,没人敢忘了她的标杆履历啊!她离开台里后,居然一头扎进了童语言教育这个赛道,一干就是十年,让机构做到了业内小圈子的爆款水准。

而这次直播,打的商品也不混乱,主打性价比和实用属性,晓蕾每件商品还得亲测过。如果你觉得她自吹自擂,建议去看看她直播间的粉丝评论。大家的意思是:晓蕾推品 = 放心买。
刘剑这枚 “前一哥”,更狠!

说晓蕾牛,别忘了,刘剑十年前也上演了一场 “豪赌”。那个年代,辞去电视台稳定工作,和 “打水漂” 差不多。

可是刘剑敢把梭全下,为啥?因为他看中了播音考生和辅导课程的市场,那时还没有多少人扎堆干这块,他觉得这是个机会。

果然,就这么辞了职,工作的腰板从跟组织吃工资,摇身变成了名副其实的事业单位 —— 自己家老板。虽然后来也是磕磕绊绊,但终究从试验田里掘出了一片肥沃地。
主持人的 “下海”,是换方向走

有人觉得,曾经的新闻人、主持人 “跑去带货”,肯定是混不下去了。你要放在十年前,这种联想不稀奇,可现在不一样了。大环境变了,传统媒体是真的在互联网时代被打败得找不到调。

原来电视频道的观众,现在早转移到手机端,看知乎、刷短视频,甚至晚上蹲个带货直播会。你说新闻节目的高冷主播,现在换脸做带货主持,是不是 “落魄”?未必。

晓蕾夫妻这一波,实际上是转型很成功的范例。不管带啥网红货,他们俩把品质第一的逻辑摆明白了。这样的主播,不止卖产品,更卖信誉,靠着时间积攒了观众的信任。
直播间哪门子 LOW?明明是主战场

网友说得有趣:“谁嫌直播带货 LOW,谁就输定了。” 道理没跑儿,移动互联网成了咱生活重心,生意也跟着迁移。这是明显趋势,看不懂的还真不想赚钱了。

而且,做直播一点不轻松。站几个小时口播、随时照顾弹幕情绪,这比坐着念提词器辛苦多了。像晓蕾和刘剑这样的 “摸鱼资历”,能转过身来赚饭钱,这不是 “混”,是 “拼” 啊。

别说传统意义的职业崇拜消失殆尽,你觉得稳如狗的岗位,说散架就散。老一辈金饭碗情结,对于下一代新创别说香,而是种被淘汰跑赢速度内心创新积极点。

我不是电视台员工了,早就离职 10 年了。"""

    # Create a splitter instance with the target paragraph-length range
    splitter = TextSplitter(min_length=10, max_length=20)

    # Split the text
    paragraphs = splitter.split_text(sample_text)

    # Print the results
    print("\n分割结果:")
    for i, para in enumerate(paragraphs, 1):
        print(para)