From 5397e7cfc2300302df7e955f3e36ef891f5dfb53 Mon Sep 17 00:00:00 2001
From: wsb1224
Date: Mon, 26 May 2025 09:23:17 +0800
Subject: [PATCH] Modify the web page content extraction code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...chWTX.py => ArticleReplaceDifyBatchWTT.py} |  2 +-
 ArticleReplaceBatch/images_edit.py            | 11 ++-
 ArticleReplaceBatch/main_process.py           |  4 +-
 ArticleReplaceBatch/main_process_wtt.py       | 95 +++++++++++--------
 ArticleReplaceBatch/test.py                   | 21 ++--
 ArticleReplaceBatch/txt2docx.py               |  9 +-
 ArticleReplaceBatch/utils.py                  | 19 ++--
 7 files changed, 102 insertions(+), 59 deletions(-)
 rename ArticleReplaceBatch/{ArticleReplaceDifyBatchWTX.py => ArticleReplaceDifyBatchWTT.py} (99%)

diff --git a/ArticleReplaceBatch/ArticleReplaceDifyBatchWTX.py b/ArticleReplaceBatch/ArticleReplaceDifyBatchWTT.py
similarity index 99%
rename from ArticleReplaceBatch/ArticleReplaceDifyBatchWTX.py
rename to ArticleReplaceBatch/ArticleReplaceDifyBatchWTT.py
index 1a4ec7c..296aca2 100644
--- a/ArticleReplaceBatch/ArticleReplaceDifyBatchWTX.py
+++ b/ArticleReplaceBatch/ArticleReplaceDifyBatchWTT.py
@@ -78,7 +78,7 @@ class ArticleReplaceApp(tk.Tk):
         # AI service provider selection
         ttk.Label(control_frame, text="工作流选择:").grid(row=2, column=0, padx=5, pady=5, sticky=tk.W)
-        self.ai_service_var = tk.StringVar(value="dify")
+        self.ai_service_var = tk.StringVar(value="coze")  # default workflow is now coze
         ai_service_combo = ttk.Combobox(control_frame, textvariable=self.ai_service_var, values=["dify", "coze"], width=10, state="readonly")
         ai_service_combo.grid(row=2, column=1, padx=5, pady=5, sticky=tk.W)

diff --git a/ArticleReplaceBatch/images_edit.py b/ArticleReplaceBatch/images_edit.py
index 0e4213d..b466bbc 100644
--- a/ArticleReplaceBatch/images_edit.py
+++ b/ArticleReplaceBatch/images_edit.py
@@ -83,11 +83,18 @@ def download_image(image_url, save_path):
         print(f"Request error: {e}")


-def download_and_process_images(img_urls, article_title):
+def download_and_process_images(img_urls, article_title, save_dir=None):
     """
     Download and process images
+    :param img_urls: list of image URLs
+    :param article_title: article title
+    :param save_dir: custom save directory; the default directory is used when None
     """
-    img_dir_path = os.path.join(IMGS_BASE_PATH, article_title)
+    if save_dir is None:
+        save_dir = IMGS_BASE_PATH
+
+    img_dir_path = os.path.join(str(save_dir), str(article_title))
+    logger.info(f"Image save path: {img_dir_path}")
     safe_open_directory(img_dir_path)

     for i, img_url in enumerate(img_urls):
diff --git a/ArticleReplaceBatch/main_process.py b/ArticleReplaceBatch/main_process.py
index 3316d3f..4844b5e 100644
--- a/ArticleReplaceBatch/main_process.py
+++ b/ArticleReplaceBatch/main_process.py
@@ -26,6 +26,8 @@ def process_link(link, ai_service):
     else:
         title_text, article_text, img_urls = "", "", []

+    print(title_text)  # debug: show the extracted title
+
    if title_text == "":
         return
     elif len(title_text) > 100:
@@ -111,7 +113,7 @@ def process_link(link, ai_service):

         # finally_article = message_content.replace("正文:", "") + "\n"

-        file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)
+        file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH, title_text)[0]

         article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")

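Note on the two hunks above: download_and_process_images gains an optional save_dir that preserves the old IMGS_BASE_PATH behaviour when omitted, and handle_duplicate_files_advanced evidently returns a sequence whose first element is the usable file name, hence the new [0] indexing. A minimal usage sketch of the new parameter; the image URL and the "output/tech" directory are made-up values, not part of the patch:

    from images_edit import download_and_process_images

    img_urls = ["https://example.com/cover.jpg"]  # hypothetical image URL list

    # Old call style still works: images land in IMGS_BASE_PATH/<title>/
    download_and_process_images(img_urls, "some-article")

    # New call style: redirect output under a custom root ("output/tech" is hypothetical)
    download_and_process_images(img_urls, "some-article", save_dir="output/tech")
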
diff --git a/ArticleReplaceBatch/main_process_wtt.py b/ArticleReplaceBatch/main_process_wtt.py
index 3228d48..18f275d 100644
--- a/ArticleReplaceBatch/main_process_wtt.py
+++ b/ArticleReplaceBatch/main_process_wtt.py
@@ -2,7 +2,7 @@ import threading
 import queue
 import json  # import the json module

-from ai_studio import call_dify_workflow, call_coze_workflow
+from ai_studio import call_dify_workflow, call_coze_workflow, call_coze_article_workflow

 from databases import *
 from images_edit import download_and_process_images
@@ -12,7 +12,8 @@ from config import *

 # ============================== Main program ===========================
-def process_link(link, ai_service):
+def process_link(link_info, ai_service):
+    link, article_type = link_info  # unpack the link and its type
     """
     Process a single link
     :param link: the link to process
@@ -25,6 +26,8 @@
         title_text, article_text, img_urls = toutiao_extract_content(link)
     elif link.startswith("https://mp.weixin.qq.co"):
         title_text, article_text, img_urls = wechat_extract_content(link)
+    elif link.startswith("https://www.163.com"):
+        title_text, article_text, img_urls = wangyi_extract_content(link)
     else:
         title_text, article_text, img_urls = "", "", []

@@ -44,7 +47,7 @@

     title = extract_content_until_punctuation(article_text).replace("正文:", "")

-    print(title)
+    print(img_urls)  # debug: show the extracted image URLs
     print(article_text)

     from datetime import datetime
@@ -78,55 +81,61 @@
         #     }
         message_content = call_dify_workflow(input_data)
     elif ai_service == "coze":
-        logger.info("coze is processing")
-        weijin = ""
-        if check_keywords:
-            weijin = "违禁"
-        # load the Coze input_data template from config
-        input_data_template_str = CONFIG['Coze'].get('input_data_template',
-                                                     '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
-        try:
-            # parse the template string into a dict
-            input_data_template = json.loads(input_data_template_str)
-            # format the template with the actual variables
-            input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
-                          input_data_template.items()}
-        except (json.JSONDecodeError, KeyError, AttributeError) as e:
-            logger.error(f"Error processing the Coze input_data template: {e}. Falling back to the default template.")
-            input_data = {
-                "article": article_text,
-                "link": link,
-                "weijin": weijin
+        # logger.info("coze is processing")
+        # weijin = ""
+        # if check_keywords:
+        #     weijin = "违禁"
+        # # load the Coze input_data template from config
+        # input_data_template_str = CONFIG['Coze'].get('input_data_template',
+        #                                              '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
+        # try:
+        #     # parse the template string into a dict
+        #     input_data_template = json.loads(input_data_template_str)
+        #     # format the template with the actual variables
+        #     input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
+        #                   input_data_template.items()}
+        # except (json.JSONDecodeError, KeyError, AttributeError) as e:
+        #     logger.error(f"Error processing the Coze input_data template: {e}. Falling back to the default template.")
+        #     input_data = {
+        #         "article": article_text,
+        #         "link": link,
+        #         "weijin": weijin
+        #     }
+        input_data = {
+            "article": article_text
             }
+        message_content = call_coze_article_workflow(input_data)

-        msg = call_coze_workflow(input_data)
-        message_content = msg['article']
-        result = msg['result']
-        if result == "已经创作过":
-            return

     # get and format the current time
     current_time = datetime.now().strftime("%H:%M:%S")

     # print the current time
     print("Current time:", current_time)

-    finally_article = message_content.replace("正文:", "") + "\n"

-    article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{title}.txt")
+    file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH, title_text)[0]

-    if '*' in finally_article or '#' in finally_article or "-" in finally_article:
+
+    # create the per-type directory
+    type_dir = os.path.join(ARTICLES_BASE_PATH, article_type)
+    safe_open_directory(type_dir)
+
+    # save the article under the type directory
+    article_save_path = os.path.join(type_dir, f"{file_name}.txt")
+
+    if '*' in message_content or '#' in message_content or "-" in message_content:
         # strip several characters at once with a regex
         old_content = re.sub(r'[*#-]', '', message_content)
     else:
         # no replacement needed, use the content as-is
-        old_content = finally_article
+        old_content = message_content

     print("Rewritten article: " + old_content)

     # remove AI-sounding wording
     content = old_content

-    check_link_insert(host, user, password, database, link)
+
     # check article compliance
     if text_detection(content) == "合规":
@@ -141,7 +150,10 @@
             logging.info('Text saved')

             if img_urls:
-                download_and_process_images(img_urls, title)
+                # create the image directory under the type directory
+                type_picture_dir = os.path.join(PICTURE_BASE_PATH, article_type)
+                safe_open_directory(type_picture_dir)
+                download_and_process_images(img_urls, file_name, type_picture_dir)

     except Exception as e:
         logging.error(f"Error processing link {link}: {e}")
@@ -161,15 +173,16 @@ def link_to_text(prompt1=None, prompt2=None, num_threads=None, ai_service="dify"
     password = CONFIG['Database']['password']
     database = CONFIG['Database']['database']

-    for link in links:
+    for link_info in links:
+        link = link_info[0]  # take the link
         logging.info(f"{len(links)} links in total")
-        if check_link_exists(host, user, password, database, link):
-            logger.info(f"Link already exists: {link}")
-            continue
-        else:
-            filtered_links.append(link)
-            logger.info(f"Link does not exist: {link}")
-            print("New link, storing it in the filter:", link)
+        # if check_link_exists(host, user, password, database, link):
+        #     logger.info(f"Link already exists: {link}")
+        #     continue
+        # else:
+        filtered_links.append(link_info)  # keep the (link, type) pair; process_link unpacks it
+        # logger.info(f"Link does not exist: {link}")
+        # print("New link, storing it in the filter:", link)

     if not filtered_links:
         logger.info("No new links to process")

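process_link in main_process_wtt.py now expects a (link, article_type) pair instead of a bare URL, and fans its output into per-type subdirectories. A minimal calling sketch under those assumptions; the URL and the "tech" type are made-up values:

    link_info = ("https://www.163.com/dy/article/XXXXXXXX.html", "tech")  # hypothetical row
    process_link(link_info, ai_service="coze")
    # Expected layout after a successful, compliant run:
    #   ARTICLES_BASE_PATH/tech/<deduplicated file name>.txt
    #   PICTURE_BASE_PATH/tech/<deduplicated file name>/<downloaded images>
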
使用默认模板.") + # input_data = { + # "article": article_text, + # "link": link, + # "weijin": weijin + # } + input_data = { + "article": article_text } + message_content = call_coze_article_workflow(input_data) - msg = call_coze_workflow(input_data) - message_content = msg['article'] - result = msg['result'] - if result == "已经创作过": - return # 获取当前时间并格式化 current_time = datetime.now().strftime("%H:%M:%S") # 打印当前时间 print("当前时间:", current_time) - finally_article = message_content.replace("正文:", "") + "\n" - article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{title}.txt") + file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0] - if '*' in finally_article or '#' in finally_article or "-" in finally_article: + + # 创建类型目录 + type_dir = os.path.join(ARTICLES_BASE_PATH, article_type) + safe_open_directory(type_dir) + + # 在类型目录下保存文章 + article_save_path = os.path.join(type_dir, f"{file_name}.txt") + + if '*' in message_content or '#' in message_content or "-" in message_content: # 使用正则表达式一次性替换多个字符 old_content = re.sub(r'[*#-]', '', message_content) else: # 如果不需要替换,直接使用原内容 - old_content = finally_article + old_content = message_content print("改写完成的文章:" + old_content) # 删除AI词汇 content = old_content - check_link_insert(host, user, password, database, link) + # 判断文章合规度 if text_detection(content) == "合规": @@ -141,7 +150,10 @@ def process_link(link, ai_service): logging.info('文本已经保存') if img_urls: - download_and_process_images(img_urls, title) + # 在类型目录下创建图片目录 + type_picture_dir = os.path.join(PICTURE_BASE_PATH, article_type) + safe_open_directory(type_picture_dir) + download_and_process_images(img_urls, file_name, type_picture_dir) except Exception as e: logging.error(f"处理链接 {link} 时出错: {e}") @@ -161,15 +173,16 @@ def link_to_text(prompt1=None, prompt2=None, num_threads=None, ai_service="dify" password = CONFIG['Database']['password'] database = CONFIG['Database']['database'] - for link in links: + for link_info in links: + link = link_info[0] # 获取链接 logging.info(f"总共{len(links)}个链接") - if check_link_exists(host, user, password, database, link): - logger.info(f"链接已存在: {link}") - continue - else: - filtered_links.append(link) - logger.info(f"链接不存在: {link}") - print("链接不存在,存储到过滤器中:", link) + # if check_link_exists(host, user, password, database, link): + # logger.info(f"链接已存在: {link}") + # continue + # else: + filtered_links.append(link) + # logger.info(f"链接不存在: {link}") + # print("链接不存在,存储到过滤器中:", link) if not filtered_links: logger.info("没有新链接需要处理") diff --git a/ArticleReplaceBatch/test.py b/ArticleReplaceBatch/test.py index ccd652a..5098051 100644 --- a/ArticleReplaceBatch/test.py +++ b/ArticleReplaceBatch/test.py @@ -4,17 +4,24 @@ import requests from bs4 import BeautifulSoup -from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content +from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content,wangyi_extract_content,souhu_extract_content from utils import handle_duplicate_files_advanced - +from images_edit import download_and_process_images # title,article,imgs = wechat_extract_content("https://mp.weixin.qq.com/s/3KejJOMuY2y6LA5k1tNwcg") -# title,article,imgs = toutiao_w_extract_content("https://www.toutiao.com/w/1830082267985932/") +# title,article,imgs = toutiao_w_extract_content("https://www.t outiao.com/w/1830082267985932/") # title,article,imgs = toutiao_extract_content("https://www.toutiao.com/article/7496132108239356479/") -# print(imgs) -# print(type(imgs)) +# title,article,imgs = 
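The smoke tests above all lean on one extractor contract: every *_extract_content function returns a (title, article, imgs) triple, where imgs is a list of image URL strings. A sketch of how that feeds the image helper, reusing the Sohu URL from test.py; the call is illustrative, not part of the patch:

    title, article, imgs = souhu_extract_content("https://www.sohu.com/a/893588175_115479?scm=")
    assert isinstance(imgs, list)  # what print(type(imgs)) checks informally above
    if imgs:
        download_and_process_images(imgs, title)  # default save directory
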
diff --git a/ArticleReplaceBatch/txt2docx.py b/ArticleReplaceBatch/txt2docx.py
index 59b3914..1b5286d 100644
--- a/ArticleReplaceBatch/txt2docx.py
+++ b/ArticleReplaceBatch/txt2docx.py
@@ -271,7 +271,14 @@ def txt2docx(txt_path, image_path, keep_txt=True):
         else:
             new_text = text.replace("```markdown", "").replace("```", "")
         content = new_text
-        image_folder = img_path + '\\' + txt_name.replace(".txt", "").rstrip(".")
+        # image_folder = img_path + r'\\' + txt_name.replace(".txt", "").rstrip(".")
+        # image_folder = os.path.join(img_path, txt_name.replace(".txt", "").rstrip("."))
+        from pathlib import Path
+
+        # pathlib joins paths portably, avoiding the hard-coded Windows separator
+        img_path = Path(img_path)
+        image_folder = img_path / txt_name.replace(".txt", "").rstrip(".")
+
         # crop_and_replace_images(image_folder)
         create_word_document(content, image_folder, txt.replace(".txt", ".docx"), title_name)

diff --git a/ArticleReplaceBatch/utils.py b/ArticleReplaceBatch/utils.py
index 8ab3b47..811db15 100644
--- a/ArticleReplaceBatch/utils.py
+++ b/ArticleReplaceBatch/utils.py
@@ -89,14 +89,21 @@ def extract_content_until_punctuation(text, punctuations=r'[,。!?;]'):



-# Read one column of an Excel sheet and return its contents as a list
+# Read the link column and the type column of an Excel sheet and return them as a list of tuples
 def read_excel(file_name):
     datas = pd.read_excel(file_name)
-    first_column_name = datas.columns[0]
-    first_colunm_data = datas[first_column_name].tolist()
-    print(first_colunm_data)
-
-    return first_colunm_data
+    first_column_name = datas.columns[0]  # link column
+    type_column_name = '类型'  # type column
+
+    links = datas[first_column_name].tolist()
+    # read the type column if it exists, otherwise fall back to the default type
+    types = datas[type_column_name].tolist() if type_column_name in datas.columns else ['默认'] * len(links)
+
+    # pair each link with its type as a list of tuples
+    result = list(zip(links, types))
+    print(result)
+
+    return result
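Taken together, read_excel now feeds the tuple-based pipeline: each row yields (link, type), with the type falling back to '默认' when the 类型 column is missing. A sketch of the round trip, assuming a hypothetical links.xlsx whose first column holds the URLs:

    rows = read_excel("links.xlsx")  # hypothetical workbook
    # e.g. [("https://www.163.com/dy/article/....html", "科技"),
    #       ("https://mp.weixin.qq.com/s/....", "默认")]
    for link, article_type in rows:
        process_link((link, article_type), ai_service="coze")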