From d3ac3238ed2374a99dd554a932f64b5559714dcb Mon Sep 17 00:00:00 2001 From: wsb1224 Date: Wed, 15 Oct 2025 17:54:51 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E5=8A=9F=E8=83=BD=EF=BC=9A?= =?UTF-8?q?=20=E6=AE=B5=E8=90=BD=E6=8E=A7=E5=88=B6=E5=8A=9F=E8=83=BD?= =?UTF-8?q?=EF=BC=8C=E5=8F=AF=E8=87=AA=E5=AE=9A=E4=B9=89=E6=8E=A7=E5=88=B6?= =?UTF-8?q?=E6=AF=8F=E4=B8=AA=E6=AE=B5=E8=90=BD=E6=9C=89=E5=A4=9A=E5=B0=91?= =?UTF-8?q?=E5=8F=A5=E8=AF=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 10 +++++-- config.py | 47 +++++++++++++++++++++++++++++-- gui_config.py | 35 +++++++++++++++++++++-- text_processor.py | 71 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 4e79276..7ea1df3 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ TxT2DOCX/ - **编码检测** - 自动识别文件编码(UTF-8, GBK, GB2312等) - **错别字纠正** - 基于词典的智能错别字替换 - **文本清洗** - 去除多余空白和格式化字符 +- **段落控制** - 控制每段最大句子数,自动分割过长段落 - **Markdown解析** - 支持标题、列表、引用、代码块等 ### 4. 图片处理功能 @@ -154,7 +155,7 @@ TxT2DOCX/ 支持的 Markdown 语法: -```markdown +``` # 一级标题 ## 二级标题 ### 三级标题 @@ -172,7 +173,7 @@ TxT2DOCX/ `行内代码` -```代码块``` +``代码块``` ![图片](path/to/image.png) ``` @@ -242,6 +243,11 @@ TxT2DOCX/ 2. 在 `batch_processor.py` 中调用新功能 3. 可选:在 GUI 中添加相应配置选项 +#### 段落句子数控制 +1. 在配置界面的"文字处理"选项卡中设置"每段最大句子数" +2. 设置为0表示不限制,大于0的数值表示每段最多包含的句子数 +3. 程序会自动将超过限制的段落分割成多个段落 + #### 扩展图片处理 1. 在 `image_processor.py` 中添加新的处理方法 2. 支持新的图片格式或处理效果 diff --git a/config.py b/config.py index f6c8a8a..40f0a7b 100644 --- a/config.py +++ b/config.py @@ -29,6 +29,7 @@ class Config: self.reverse_text_order = False # 转换文字顺序开关 self.replace_punctuation = False # 是否替换标点符号 self.add_disclaimer = False # 是否添加免责声明 + self.max_sentences_per_paragraph = 0 # 每段最大句子数,0表示不限制 # 错别字处理配置 self.enable_char_errors = False # 是否启用错别字处理 @@ -88,6 +89,8 @@ class Config: self.enable_char_errors = section.getboolean('enable_char_errors', self.enable_char_errors) self.char_error_intensity = section.getfloat('char_error_intensity', self.char_error_intensity) self.char_error_db_path = section.get('char_error_db_path', self.char_error_db_path) + # 新增段落句子数控制配置 + self.max_sentences_per_paragraph = section.getint('max_sentences_per_paragraph', self.max_sentences_per_paragraph) # 加载图片处理配置 if 'ImageProcessing' in config_parser: @@ -145,7 +148,8 @@ class Config: 'add_disclaimer': str(self.add_disclaimer), 'enable_char_errors': str(self.enable_char_errors), 'char_error_intensity': str(self.char_error_intensity), - 'char_error_db_path': self.char_error_db_path + 'char_error_db_path': self.char_error_db_path, + 'max_sentences_per_paragraph': str(self.max_sentences_per_paragraph) # 新增配置项 } # 保存图片处理配置 @@ -246,6 +250,8 @@ class Config: self.enable_char_errors = tp.get('enable_char_errors', self.enable_char_errors) self.char_error_intensity = tp.get('char_error_intensity', self.char_error_intensity) self.char_error_db_path = tp.get('char_error_db_path', self.char_error_db_path) + # 新增段落句子数控制配置 + self.max_sentences_per_paragraph = tp.get('max_sentences_per_paragraph', self.max_sentences_per_paragraph) # 图片处理配置 if 'image_processing' in config_dict: @@ -268,8 +274,43 @@ class Config: def reset_to_defaults(self) -> None: """重置所有配置为默认值""" - self.__init__() - + # 文件处理配置 + self.txt_encoding = "utf-8" + self.match_pattern = "exact" # exact: 完全匹配, prefix: 前缀匹配, contains: 包含 + self.output_location = "txt_folder" # txt_folder or custom + + # 最近使用的文件夹路径 + self.last_txt_folder = "" + self.last_images_root = "" + self.last_output_root = "" + + # 文字处理配置 + self.reverse_text_order = False # 转换文字顺序开关 + self.replace_punctuation = False # 是否替换标点符号 + self.add_disclaimer = False # 是否添加免责声明 + self.max_sentences_per_paragraph = 0 # 每段最大句子数,0表示不限制 + + # 错别字处理配置 + self.enable_char_errors = False # 是否启用错别字处理 + self.char_error_intensity = 0.3 # 错别字强度 0.0-1.0 + self.char_error_db_path = "data/error_chars.json" # 错别字库路径 + + # 图片处理配置 + self.image_sort_by = "name" # name or time + self.image_resize = "none" # none or width + self.image_width = 6 # 英寸 + self.image_alignment = "center" # left, center, right + self.image_strategy = "cycle" # cycle, truncate, repeat_last + self.image_insert_position = "after_title" # before_title, after_title (有标题时) + self.image_insert_interval = 5 # 无标题时每隔几段插入一张图片 + + # 文档格式配置 + self.line_spacing = 1.5 + self.title_levels = 6 # 支持的最大标题层级 + + # 排版样式配置 + self.current_style = "爆款文章风格" # 当前选中的样式 + self.use_custom_style = False # 是否使用自定义样式 # 全局配置实例 CONFIG_FILE_PATH = os.path.join(os.path.expanduser("~"), ".txt2md2docx.ini") diff --git a/gui_config.py b/gui_config.py index 5f24bb9..9922cf4 100644 --- a/gui_config.py +++ b/gui_config.py @@ -156,6 +156,30 @@ def _create_text_tab(parent): ttk.Separator(parent, orient='horizontal').pack(fill='x', padx=10, pady=15) + # 段落句子数控制 + ttk.Label(parent, text='段落控制', font=('', 11, 'bold'), foreground='darkblue').pack(anchor='w', padx=10, pady=(0, 5)) + + # 每段最大句子数 + sentence_frame = ttk.Frame(parent) + sentence_frame.pack(fill='x', padx=10, pady=5) + ttk.Label(sentence_frame, text='每段最大句子数:', width=15).pack(side='left') + sentence_var = tk.IntVar(value=config.max_sentences_per_paragraph) + sentence_spin = ttk.Spinbox(sentence_frame, from_=0, to=100, textvariable=sentence_var, width=10) + sentence_spin.pack(side='left', padx=(0, 10)) + ttk.Label(sentence_frame, text='(0表示不限制)').pack(side='left') + + def update_sentence_limit(*args): + try: + config.max_sentences_per_paragraph = sentence_var.get() + except (tk.TclError, ValueError): + # 如果输入无效,设置为默认值0 + config.max_sentences_per_paragraph = 0 + sentence_var.set(0) + + sentence_var.trace('w', update_sentence_limit) + + ttk.Separator(parent, orient='horizontal').pack(fill='x', padx=10, pady=15) + # 免责声明 disclaimer_var = tk.BooleanVar(value=config.add_disclaimer) ttk.Checkbutton(parent, text='添加免责声明', variable=disclaimer_var).pack(anchor='w', padx=10, pady=5) @@ -177,7 +201,8 @@ def _create_text_tab(parent): 'db_path': db_var, 'reverse_text': reverse_var, 'punctuation': punctuation_var, - 'disclaimer': disclaimer_var + 'disclaimer': disclaimer_var, + 'max_sentences': sentence_var # 添加返回值 } @@ -264,7 +289,8 @@ def _update_image_width(value): """更新图片宽度""" try: config.image_width = float(value) - except: + except (ValueError, tk.TclError): + # 如果输入无效,保持当前值不变 pass @@ -272,7 +298,8 @@ def _update_image_interval(value): """更新图片插入间隔""" try: config.image_insert_interval = int(value) - except: + except (ValueError, tk.TclError): + # 如果输入无效,保持当前值不变 pass @@ -298,6 +325,7 @@ def _reset_to_default(char_vars): config.image_strategy = default_config.image_strategy config.line_spacing = default_config.line_spacing config.title_levels = default_config.title_levels + config.max_sentences_per_paragraph = default_config.max_sentences_per_paragraph # 添加这行 # 更新界面变量 if char_vars: @@ -307,6 +335,7 @@ def _reset_to_default(char_vars): char_vars['reverse_text'].set(default_config.reverse_text_order) char_vars['punctuation'].set(default_config.replace_punctuation) char_vars['disclaimer'].set(default_config.add_disclaimer) + char_vars['max_sentences'].set(default_config.max_sentences_per_paragraph) # 添加这行 messagebox.showinfo('信息', '配置已重置为默认值') diff --git a/text_processor.py b/text_processor.py index eb2620a..afd9640 100644 --- a/text_processor.py +++ b/text_processor.py @@ -149,6 +149,10 @@ class TextProcessor: # 应用错别字处理 processed_text = self.apply_char_errors(processed_text) + # 控制段落句子数 + if config.max_sentences_per_paragraph > 0: + processed_text = self.limit_sentences_per_paragraph(processed_text, config.max_sentences_per_paragraph) + # 最后进行标点符号替换 if config.replace_punctuation: processed_text = self.replace_periods(processed_text) @@ -285,6 +289,73 @@ class TextProcessor: "truncated": truncated } + def limit_sentences_per_paragraph(self, text: str, max_sentences: int) -> str: + """ + 控制每个段落的句子数量 + + Args: + text: 输入文本 + max_sentences: 每段最大句子数 + + Returns: + str: 处理后的文本 + """ + if not text or max_sentences <= 0: + return text + + # 定义句子结束标点符号 + sentence_endings = ['。', '!', '?', '.', '!', '?'] + + # 按段落分割文本 + paragraphs = text.split('\n') + processed_paragraphs = [] + + for paragraph in paragraphs: + if not paragraph.strip(): + processed_paragraphs.append(paragraph) + continue + + # 找到所有句子结束位置 + sentences = [] + current_sentence = "" + + for char in paragraph: + current_sentence += char + # 如果是句子结束符号,则认为是一个完整句子 + if char in sentence_endings: + sentences.append(current_sentence) + current_sentence = "" + + # 添加最后一个可能没有结束符号的句子 + if current_sentence.strip(): + sentences.append(current_sentence) + + # 如果段落句子数不超过限制,直接添加 + if len(sentences) <= max_sentences: + processed_paragraphs.append(paragraph) + continue + + # 如果超过限制,重新组织段落 + new_paragraphs = [] + current_new_paragraph = "" + + for i, sentence in enumerate(sentences): + current_new_paragraph += sentence + + # 每达到max_sentences句就换段落 + if (i + 1) % max_sentences == 0: + new_paragraphs.append(current_new_paragraph.strip()) + current_new_paragraph = "" + + # 添加剩余的句子 + if current_new_paragraph.strip(): + new_paragraphs.append(current_new_paragraph.strip()) + + # 将新段落添加到结果中 + processed_paragraphs.extend(new_paragraphs) + + return '\n'.join(processed_paragraphs) + # 创建全局文本处理器实例 text_processor = TextProcessor()