更新功能:
段落控制功能,可自定义控制每个段落有多少句话
This commit is contained in:
parent
6b5e4adea6
commit
d3ac3238ed
10
README.md
10
README.md
@ -93,6 +93,7 @@ TxT2DOCX/
|
|||||||
- **编码检测** - 自动识别文件编码(UTF-8, GBK, GB2312等)
|
- **编码检测** - 自动识别文件编码(UTF-8, GBK, GB2312等)
|
||||||
- **错别字纠正** - 基于词典的智能错别字替换
|
- **错别字纠正** - 基于词典的智能错别字替换
|
||||||
- **文本清洗** - 去除多余空白和格式化字符
|
- **文本清洗** - 去除多余空白和格式化字符
|
||||||
|
- **段落控制** - 控制每段最大句子数,自动分割过长段落
|
||||||
- **Markdown解析** - 支持标题、列表、引用、代码块等
|
- **Markdown解析** - 支持标题、列表、引用、代码块等
|
||||||
|
|
||||||
### 4. 图片处理功能
|
### 4. 图片处理功能
|
||||||
@ -154,7 +155,7 @@ TxT2DOCX/
|
|||||||
|
|
||||||
支持的 Markdown 语法:
|
支持的 Markdown 语法:
|
||||||
|
|
||||||
```markdown
|
```
|
||||||
# 一级标题
|
# 一级标题
|
||||||
## 二级标题
|
## 二级标题
|
||||||
### 三级标题
|
### 三级标题
|
||||||
@ -172,7 +173,7 @@ TxT2DOCX/
|
|||||||
|
|
||||||
`行内代码`
|
`行内代码`
|
||||||
|
|
||||||
```代码块```
|
```代码块```
|
||||||
|
|
||||||

|

|
||||||
```
|
```
|
||||||
@ -242,6 +243,11 @@ TxT2DOCX/
|
|||||||
2. 在 `batch_processor.py` 中调用新功能
|
2. 在 `batch_processor.py` 中调用新功能
|
||||||
3. 可选:在 GUI 中添加相应配置选项
|
3. 可选:在 GUI 中添加相应配置选项
|
||||||
|
|
||||||
|
#### 段落句子数控制
|
||||||
|
1. 在配置界面的"文字处理"选项卡中设置"每段最大句子数"
|
||||||
|
2. 设置为0表示不限制,大于0的数值表示每段最多包含的句子数
|
||||||
|
3. 程序会自动将超过限制的段落分割成多个段落
|
||||||
|
|
||||||
#### 扩展图片处理
|
#### 扩展图片处理
|
||||||
1. 在 `image_processor.py` 中添加新的处理方法
|
1. 在 `image_processor.py` 中添加新的处理方法
|
||||||
2. 支持新的图片格式或处理效果
|
2. 支持新的图片格式或处理效果
|
||||||
|
|||||||
47
config.py
47
config.py
@ -29,6 +29,7 @@ class Config:
|
|||||||
self.reverse_text_order = False # 转换文字顺序开关
|
self.reverse_text_order = False # 转换文字顺序开关
|
||||||
self.replace_punctuation = False # 是否替换标点符号
|
self.replace_punctuation = False # 是否替换标点符号
|
||||||
self.add_disclaimer = False # 是否添加免责声明
|
self.add_disclaimer = False # 是否添加免责声明
|
||||||
|
self.max_sentences_per_paragraph = 0 # 每段最大句子数,0表示不限制
|
||||||
|
|
||||||
# 错别字处理配置
|
# 错别字处理配置
|
||||||
self.enable_char_errors = False # 是否启用错别字处理
|
self.enable_char_errors = False # 是否启用错别字处理
|
||||||
@ -88,6 +89,8 @@ class Config:
|
|||||||
self.enable_char_errors = section.getboolean('enable_char_errors', self.enable_char_errors)
|
self.enable_char_errors = section.getboolean('enable_char_errors', self.enable_char_errors)
|
||||||
self.char_error_intensity = section.getfloat('char_error_intensity', self.char_error_intensity)
|
self.char_error_intensity = section.getfloat('char_error_intensity', self.char_error_intensity)
|
||||||
self.char_error_db_path = section.get('char_error_db_path', self.char_error_db_path)
|
self.char_error_db_path = section.get('char_error_db_path', self.char_error_db_path)
|
||||||
|
# 新增段落句子数控制配置
|
||||||
|
self.max_sentences_per_paragraph = section.getint('max_sentences_per_paragraph', self.max_sentences_per_paragraph)
|
||||||
|
|
||||||
# 加载图片处理配置
|
# 加载图片处理配置
|
||||||
if 'ImageProcessing' in config_parser:
|
if 'ImageProcessing' in config_parser:
|
||||||
@ -145,7 +148,8 @@ class Config:
|
|||||||
'add_disclaimer': str(self.add_disclaimer),
|
'add_disclaimer': str(self.add_disclaimer),
|
||||||
'enable_char_errors': str(self.enable_char_errors),
|
'enable_char_errors': str(self.enable_char_errors),
|
||||||
'char_error_intensity': str(self.char_error_intensity),
|
'char_error_intensity': str(self.char_error_intensity),
|
||||||
'char_error_db_path': self.char_error_db_path
|
'char_error_db_path': self.char_error_db_path,
|
||||||
|
'max_sentences_per_paragraph': str(self.max_sentences_per_paragraph) # 新增配置项
|
||||||
}
|
}
|
||||||
|
|
||||||
# 保存图片处理配置
|
# 保存图片处理配置
|
||||||
@ -246,6 +250,8 @@ class Config:
|
|||||||
self.enable_char_errors = tp.get('enable_char_errors', self.enable_char_errors)
|
self.enable_char_errors = tp.get('enable_char_errors', self.enable_char_errors)
|
||||||
self.char_error_intensity = tp.get('char_error_intensity', self.char_error_intensity)
|
self.char_error_intensity = tp.get('char_error_intensity', self.char_error_intensity)
|
||||||
self.char_error_db_path = tp.get('char_error_db_path', self.char_error_db_path)
|
self.char_error_db_path = tp.get('char_error_db_path', self.char_error_db_path)
|
||||||
|
# 新增段落句子数控制配置
|
||||||
|
self.max_sentences_per_paragraph = tp.get('max_sentences_per_paragraph', self.max_sentences_per_paragraph)
|
||||||
|
|
||||||
# 图片处理配置
|
# 图片处理配置
|
||||||
if 'image_processing' in config_dict:
|
if 'image_processing' in config_dict:
|
||||||
@ -268,8 +274,43 @@ class Config:
|
|||||||
|
|
||||||
def reset_to_defaults(self) -> None:
|
def reset_to_defaults(self) -> None:
|
||||||
"""Reset every configuration option to its default value."""
|
"""Reset every configuration option to its default value."""
|
||||||
self.__init__()
|
# NOTE(review): the values below presumably mirror the defaults set in __init__ — confirm they stay in sync
# File handling settings
|
||||||
|
self.txt_encoding = "utf-8"
|
||||||
|
self.match_pattern = "exact" # exact: full match, prefix: prefix match, contains: substring match
|
||||||
|
self.output_location = "txt_folder" # txt_folder or custom
|
||||||
|
|
||||||
|
# Most recently used folder paths
|
||||||
|
self.last_txt_folder = ""
|
||||||
|
self.last_images_root = ""
|
||||||
|
self.last_output_root = ""
|
||||||
|
|
||||||
|
# Text processing settings
|
||||||
|
self.reverse_text_order = False # toggle for reversing the text order
|
||||||
|
self.replace_punctuation = False # whether to replace punctuation
|
||||||
|
self.add_disclaimer = False # whether to add a disclaimer
|
||||||
|
self.max_sentences_per_paragraph = 0 # max sentences per paragraph; 0 means no limit
|
||||||
|
|
||||||
|
# Typo (character error) processing settings
|
||||||
|
self.enable_char_errors = False # whether typo processing is enabled
|
||||||
|
self.char_error_intensity = 0.3 # typo intensity, 0.0-1.0
|
||||||
|
self.char_error_db_path = "data/error_chars.json" # path to the typo dictionary
|
||||||
|
|
||||||
|
# Image processing settings
|
||||||
|
self.image_sort_by = "name" # name or time
|
||||||
|
self.image_resize = "none" # none or width
|
||||||
|
self.image_width = 6 # inches
|
||||||
|
self.image_alignment = "center" # left, center, right
|
||||||
|
self.image_strategy = "cycle" # cycle, truncate, repeat_last
|
||||||
|
self.image_insert_position = "after_title" # before_title, after_title (when the text has titles)
|
||||||
|
self.image_insert_interval = 5 # without titles: insert one image every N paragraphs
|
||||||
|
|
||||||
|
# Document format settings
|
||||||
|
self.line_spacing = 1.5
|
||||||
|
self.title_levels = 6 # deepest heading level supported
|
||||||
|
|
||||||
|
# Layout style settings
|
||||||
|
self.current_style = "爆款文章风格" # currently selected style
|
||||||
|
self.use_custom_style = False # whether to use a custom style
|
||||||
|
|
||||||
# 全局配置实例
|
# 全局配置实例
|
||||||
CONFIG_FILE_PATH = os.path.join(os.path.expanduser("~"), ".txt2md2docx.ini")
|
CONFIG_FILE_PATH = os.path.join(os.path.expanduser("~"), ".txt2md2docx.ini")
|
||||||
|
|||||||
@ -156,6 +156,30 @@ def _create_text_tab(parent):
|
|||||||
|
|
||||||
ttk.Separator(parent, orient='horizontal').pack(fill='x', padx=10, pady=15)
|
ttk.Separator(parent, orient='horizontal').pack(fill='x', padx=10, pady=15)
|
||||||
|
|
||||||
|
# 段落句子数控制
|
||||||
|
ttk.Label(parent, text='段落控制', font=('', 11, 'bold'), foreground='darkblue').pack(anchor='w', padx=10, pady=(0, 5))
|
||||||
|
|
||||||
|
# 每段最大句子数
|
||||||
|
sentence_frame = ttk.Frame(parent)
|
||||||
|
sentence_frame.pack(fill='x', padx=10, pady=5)
|
||||||
|
ttk.Label(sentence_frame, text='每段最大句子数:', width=15).pack(side='left')
|
||||||
|
sentence_var = tk.IntVar(value=config.max_sentences_per_paragraph)
|
||||||
|
sentence_spin = ttk.Spinbox(sentence_frame, from_=0, to=100, textvariable=sentence_var, width=10)
|
||||||
|
sentence_spin.pack(side='left', padx=(0, 10))
|
||||||
|
ttk.Label(sentence_frame, text='(0表示不限制)').pack(side='left')
|
||||||
|
|
||||||
|
def update_sentence_limit(*args):
    """Copy the spinbox value into ``config.max_sentences_per_paragraph``.

    Used as the write-trace callback of ``sentence_var``, so it runs on
    every edit of the field, including half-typed input.

    Args:
        *args: Trace arguments supplied by Tk; ignored.
    """
    try:
        value = sentence_var.get()
    except (tk.TclError, ValueError):
        # The field is empty or not a number, which normally just means the
        # user is in the middle of typing. Keep the last valid setting and
        # leave the widget alone: writing 0 back here would overwrite the
        # user's edit and silently switch the limit off.
        return

    # The spinbox range starts at 0, but typed text can still be negative;
    # 0 is the documented "no limit" value, so clamp to it.
    config.max_sentences_per_paragraph = max(0, value)
|
||||||
|
|
||||||
|
sentence_var.trace('w', update_sentence_limit)
|
||||||
|
|
||||||
|
ttk.Separator(parent, orient='horizontal').pack(fill='x', padx=10, pady=15)
|
||||||
|
|
||||||
# 免责声明
|
# 免责声明
|
||||||
disclaimer_var = tk.BooleanVar(value=config.add_disclaimer)
|
disclaimer_var = tk.BooleanVar(value=config.add_disclaimer)
|
||||||
ttk.Checkbutton(parent, text='添加免责声明', variable=disclaimer_var).pack(anchor='w', padx=10, pady=5)
|
ttk.Checkbutton(parent, text='添加免责声明', variable=disclaimer_var).pack(anchor='w', padx=10, pady=5)
|
||||||
@ -177,7 +201,8 @@ def _create_text_tab(parent):
|
|||||||
'db_path': db_var,
|
'db_path': db_var,
|
||||||
'reverse_text': reverse_var,
|
'reverse_text': reverse_var,
|
||||||
'punctuation': punctuation_var,
|
'punctuation': punctuation_var,
|
||||||
'disclaimer': disclaimer_var
|
'disclaimer': disclaimer_var,
|
||||||
|
'max_sentences': sentence_var # 添加返回值
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -264,7 +289,8 @@ def _update_image_width(value):
|
|||||||
"""更新图片宽度"""
|
"""更新图片宽度"""
|
||||||
try:
|
try:
|
||||||
config.image_width = float(value)
|
config.image_width = float(value)
|
||||||
except:
|
except (ValueError, tk.TclError):
|
||||||
|
# 如果输入无效,保持当前值不变
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@ -272,7 +298,8 @@ def _update_image_interval(value):
|
|||||||
"""更新图片插入间隔"""
|
"""更新图片插入间隔"""
|
||||||
try:
|
try:
|
||||||
config.image_insert_interval = int(value)
|
config.image_insert_interval = int(value)
|
||||||
except:
|
except (ValueError, tk.TclError):
|
||||||
|
# 如果输入无效,保持当前值不变
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@ -298,6 +325,7 @@ def _reset_to_default(char_vars):
|
|||||||
config.image_strategy = default_config.image_strategy
|
config.image_strategy = default_config.image_strategy
|
||||||
config.line_spacing = default_config.line_spacing
|
config.line_spacing = default_config.line_spacing
|
||||||
config.title_levels = default_config.title_levels
|
config.title_levels = default_config.title_levels
|
||||||
|
config.max_sentences_per_paragraph = default_config.max_sentences_per_paragraph # 添加这行
|
||||||
|
|
||||||
# 更新界面变量
|
# 更新界面变量
|
||||||
if char_vars:
|
if char_vars:
|
||||||
@ -307,6 +335,7 @@ def _reset_to_default(char_vars):
|
|||||||
char_vars['reverse_text'].set(default_config.reverse_text_order)
|
char_vars['reverse_text'].set(default_config.reverse_text_order)
|
||||||
char_vars['punctuation'].set(default_config.replace_punctuation)
|
char_vars['punctuation'].set(default_config.replace_punctuation)
|
||||||
char_vars['disclaimer'].set(default_config.add_disclaimer)
|
char_vars['disclaimer'].set(default_config.add_disclaimer)
|
||||||
|
char_vars['max_sentences'].set(default_config.max_sentences_per_paragraph) # 添加这行
|
||||||
|
|
||||||
messagebox.showinfo('信息', '配置已重置为默认值')
|
messagebox.showinfo('信息', '配置已重置为默认值')
|
||||||
|
|
||||||
|
|||||||
@ -149,6 +149,10 @@ class TextProcessor:
|
|||||||
# 应用错别字处理
|
# 应用错别字处理
|
||||||
processed_text = self.apply_char_errors(processed_text)
|
processed_text = self.apply_char_errors(processed_text)
|
||||||
|
|
||||||
|
# 控制段落句子数
|
||||||
|
if config.max_sentences_per_paragraph > 0:
|
||||||
|
processed_text = self.limit_sentences_per_paragraph(processed_text, config.max_sentences_per_paragraph)
|
||||||
|
|
||||||
# 最后进行标点符号替换
|
# 最后进行标点符号替换
|
||||||
if config.replace_punctuation:
|
if config.replace_punctuation:
|
||||||
processed_text = self.replace_periods(processed_text)
|
processed_text = self.replace_periods(processed_text)
|
||||||
@ -285,6 +289,73 @@ class TextProcessor:
|
|||||||
"truncated": truncated
|
"truncated": truncated
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def limit_sentences_per_paragraph(self, text: str, max_sentences: int) -> str:
    """Split paragraphs that hold more than ``max_sentences`` sentences.

    Each line of ``text`` is treated as one paragraph. Blank lines and
    paragraphs already within the limit are returned unchanged; longer
    paragraphs are regrouped into consecutive lines of at most
    ``max_sentences`` sentences, each stripped of surrounding whitespace.

    A sentence ends at a run of terminal punctuation (。!?.!?) plus any
    closing quotes or brackets that directly follow it, so "?!", "..."
    and "。”" each close exactly one sentence. An ASCII "." between two
    digits is read as a decimal point, not as a sentence end.

    Args:
        text: Input text, one paragraph per line.
        max_sentences: Maximum number of sentences per paragraph; values
            less than or equal to 0 disable the limit.

    Returns:
        str: The text with over-long paragraphs split into several lines.
    """
    if not text or max_sentences <= 0:
        return text

    terminators = frozenset("。!?.!?")
    # Only unambiguous closers are attached to the preceding sentence;
    # straight ASCII quotes may just as well open the next one.
    closers = frozenset("”’」』)】》)]")

    result = []
    for paragraph in text.split("\n"):
        if not paragraph.strip():
            # Keep blank lines exactly as they are.
            result.append(paragraph)
            continue

        sentences = []
        length = len(paragraph)
        start = 0
        pos = 0
        while pos < length:
            char = paragraph[pos]
            is_decimal_point = (
                char == "."
                and 0 < pos < length - 1
                and paragraph[pos - 1].isdigit()
                and paragraph[pos + 1].isdigit()
            )
            if char not in terminators or is_decimal_point:
                pos += 1
                continue

            # Swallow the rest of the punctuation run ("?!", "...") and any
            # closing quotes/brackets so they stay with this sentence.
            pos += 1
            while pos < length and (
                paragraph[pos] in terminators or paragraph[pos] in closers
            ):
                pos += 1
            sentences.append(paragraph[start:pos])
            start = pos

        # Trailing text without a terminator still counts as a sentence.
        tail = paragraph[start:]
        if tail.strip():
            sentences.append(tail)

        if len(sentences) <= max_sentences:
            # Within the limit: leave the paragraph untouched.
            result.append(paragraph)
            continue

        for first in range(0, len(sentences), max_sentences):
            group = sentences[first:first + max_sentences]
            result.append("".join(group).strip())

    return "\n".join(result)
|
||||||
|
|
||||||
|
|
||||||
# 创建全局文本处理器实例
|
# 创建全局文本处理器实例
|
||||||
text_processor = TextProcessor()
|
text_processor = TextProcessor()
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user