From 666494c0c2a33efad87e94c4a0bbd633cd2a2f69 Mon Sep 17 00:00:00 2001
From: 太一 <2339117167@qq.com>
Date: Mon, 12 May 2025 14:56:51 +0800
Subject: [PATCH] Modify the web page content extraction code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 ...chWTX.py => ArticleReplaceDifyBatchWTT.py} |  2 +-
 ArticleReplaceBatch/images_edit.py            |  5 +-
 ArticleReplaceBatch/main_process.py           |  4 +-
 ArticleReplaceBatch/main_process_wtt.py       | 81 ++++++++++---------
 ArticleReplaceBatch/test.py                   | 21 +++--
 ArticleReplaceBatch/txt2docx.py               | 19 ++++-
 6 files changed, 80 insertions(+), 52 deletions(-)
 rename ArticleReplaceBatch/{ArticleReplaceDifyBatchWTX.py => ArticleReplaceDifyBatchWTT.py} (99%)

diff --git a/ArticleReplaceBatch/ArticleReplaceDifyBatchWTX.py b/ArticleReplaceBatch/ArticleReplaceDifyBatchWTT.py
similarity index 99%
rename from ArticleReplaceBatch/ArticleReplaceDifyBatchWTX.py
rename to ArticleReplaceBatch/ArticleReplaceDifyBatchWTT.py
index 1a4ec7c..296aca2 100644
--- a/ArticleReplaceBatch/ArticleReplaceDifyBatchWTX.py
+++ b/ArticleReplaceBatch/ArticleReplaceDifyBatchWTT.py
@@ -78,7 +78,7 @@ class ArticleReplaceApp(tk.Tk):
 
         # AI service provider selection
         ttk.Label(control_frame, text="工作流选择:").grid(row=2, column=0, padx=5, pady=5, sticky=tk.W)
-        self.ai_service_var = tk.StringVar(value="dify")
+        self.ai_service_var = tk.StringVar(value="coze")
         ai_service_combo = ttk.Combobox(control_frame, textvariable=self.ai_service_var,
                                         values=["dify", "coze"], width=10, state="readonly")
         ai_service_combo.grid(row=2, column=1, padx=5, pady=5, sticky=tk.W)
diff --git a/ArticleReplaceBatch/images_edit.py b/ArticleReplaceBatch/images_edit.py
index 0e4213d..d4b99d4 100644
--- a/ArticleReplaceBatch/images_edit.py
+++ b/ArticleReplaceBatch/images_edit.py
@@ -87,7 +87,10 @@ def download_and_process_images(img_urls, article_title):
     """
     Download and process the images
     """
-    img_dir_path = os.path.join(IMGS_BASE_PATH, article_title)
+    logger.info(IMGS_BASE_PATH)
+    img_dir_path = os.path.join(str(IMGS_BASE_PATH), str(article_title))
+    # img_dir_path = IMGS_BASE_PATH + "/" + article_title
+    logger.info(img_dir_path)
     safe_open_directory(img_dir_path)
 
     for i, img_url in enumerate(img_urls):
diff --git a/ArticleReplaceBatch/main_process.py b/ArticleReplaceBatch/main_process.py
index 3316d3f..4844b5e 100644
--- a/ArticleReplaceBatch/main_process.py
+++ b/ArticleReplaceBatch/main_process.py
@@ -26,6 +26,8 @@ def process_link(link, ai_service):
     else:
         title_text, article_text, img_urls = "", "", []
 
+    print(title_text)
+
     if title_text == "":
         return
     elif len(title_text) > 100:
@@ -111,7 +113,7 @@ def process_link(link, ai_service):
 
         # finally_article = message_content.replace("正文:", "") + "\n"
 
-        file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)
+        file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH, title_text)[0]
 
         article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")
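Both main_process.py above and main_process_wtt.py below now take element [0] of handle_duplicate_files_advanced's return value, so the helper evidently returns a sequence whose first element is a collision-free file stem. The helper itself is not part of this patch; a minimal sketch of the contract the call sites assume, with a made-up renaming scheme, might look like:

    import os

    def handle_duplicate_files_advanced(base_path, name):
        # Return (unique_name, was_renamed): bump a counter until
        # "<unique_name>.txt" no longer exists under base_path.
        candidate, counter = name, 1
        while os.path.exists(os.path.join(base_path, f"{candidate}.txt")):
            candidate = f"{name}({counter})"
            counter += 1
        return candidate, candidate != name

Whatever the real implementation returns past index 0 is ignored by these call sites.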
diff --git a/ArticleReplaceBatch/main_process_wtt.py b/ArticleReplaceBatch/main_process_wtt.py
index 3228d48..ec51747 100644
--- a/ArticleReplaceBatch/main_process_wtt.py
+++ b/ArticleReplaceBatch/main_process_wtt.py
@@ -2,7 +2,7 @@
 import threading
 import queue
 import json  # import the json module
-from ai_studio import call_dify_workflow, call_coze_workflow
+from ai_studio import call_dify_workflow, call_coze_workflow, call_coze_article_workflow
 
 from databases import *
 from images_edit import download_and_process_images
@@ -25,6 +25,8 @@ def process_link(link, ai_service):
         title_text, article_text, img_urls = toutiao_extract_content(link)
     elif link.startswith("https://mp.weixin.qq.co"):
         title_text, article_text, img_urls = wechat_extract_content(link)
+    elif link.startswith("https://www.163.com"):
+        title_text, article_text, img_urls = wangyi_extract_content(link)
     else:
         title_text, article_text, img_urls = "", "", []
@@ -44,7 +46,7 @@ def process_link(link, ai_service):
 
     title = extract_content_until_punctuation(article_text).replace("正文:", "")
 
-    print(title)
+    print(img_urls)
     print(article_text)
 
     from datetime import datetime
@@ -78,55 +80,56 @@ def process_link(link, ai_service):
             # }
             message_content = call_dify_workflow(input_data)
         elif ai_service == "coze":
-            logger.info("coze is processing")
-            weijin = ""
-            if check_keywords:
-                weijin = "违禁"
-            # Load the Coze input_data template from the config
-            input_data_template_str = CONFIG['Coze'].get('input_data_template',
-                                                         '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
-            try:
-                # Parse the template string into a dict
-                input_data_template = json.loads(input_data_template_str)
-                # Format the template with the actual variables
-                input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
-                              input_data_template.items()}
-            except (json.JSONDecodeError, KeyError, AttributeError) as e:
-                logger.error(f"Error while processing the Coze input_data template: {e}. Falling back to the default template.")
-                input_data = {
-                    "article": article_text,
-                    "link": link,
-                    "weijin": weijin
+            # logger.info("coze is processing")
+            # weijin = ""
+            # if check_keywords:
+            #     weijin = "违禁"
+            # # Load the Coze input_data template from the config
+            # input_data_template_str = CONFIG['Coze'].get('input_data_template',
+            #                                              '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
+            # try:
+            #     # Parse the template string into a dict
+            #     input_data_template = json.loads(input_data_template_str)
+            #     # Format the template with the actual variables
+            #     input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
+            #                   input_data_template.items()}
+            # except (json.JSONDecodeError, KeyError, AttributeError) as e:
+            #     logger.error(f"Error while processing the Coze input_data template: {e}. Falling back to the default template.")
+            #     input_data = {
+            #         "article": article_text,
+            #         "link": link,
+            #         "weijin": weijin
+            #     }
+            input_data = {
+                "article": article_text
             }
+            message_content = call_coze_article_workflow(input_data)
 
-            msg = call_coze_workflow(input_data)
-            message_content = msg['article']
-            result = msg['result']
-            if result == "已经创作过":
-                return
 
         # Get the current time and format it
         current_time = datetime.now().strftime("%H:%M:%S")
         # Print the current time
        print("Current time:", current_time)
-        finally_article = message_content.replace("正文:", "") + "\n"
 
-        article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{title}.txt")
+        file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH, title_text)[0]
 
-        if '*' in finally_article or '#' in finally_article or "-" in finally_article:
+
+        article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")
+
+        if '*' in message_content or '#' in message_content or "-" in message_content:
             # Replace several characters in one pass with a regex
             old_content = re.sub(r'[*#-]', '', message_content)
         else:
             # If no replacement is needed, use the original content as-is
-            old_content = finally_article
+            old_content = message_content
 
         print("Rewritten article: " + old_content)
 
         # Strip AI-flavoured wording
         content = old_content
 
-        check_link_insert(host, user, password, database, link)
+
         # Check the article's compliance
         if text_detection(content) == "合规":
@@ -141,7 +144,7 @@ def process_link(link, ai_service):
             logging.info('The text has been saved')
 
             if img_urls:
-                download_and_process_images(img_urls, title)
+                download_and_process_images(img_urls, file_name)
 
     except Exception as e:
         logging.error(f"Error while processing link {link}: {e}")
@@ -163,13 +166,13 @@ def link_to_text(prompt1=None, prompt2=None, num_threads=None, ai_service="dify"
 
     for link in links:
         logging.info(f"{len(links)} links in total")
-        if check_link_exists(host, user, password, database, link):
-            logger.info(f"Link already exists: {link}")
-            continue
-        else:
-            filtered_links.append(link)
-            logger.info(f"Link does not exist: {link}")
-            print("Link not seen before; adding it to the filter:", link)
+        # if check_link_exists(host, user, password, database, link):
+        #     logger.info(f"Link already exists: {link}")
+        #     continue
+        # else:
+        filtered_links.append(link)
+        # logger.info(f"Link does not exist: {link}")
+        # print("Link not seen before; adding it to the filter:", link)
 
     if not filtered_links:
         logger.info("No new links to process")
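With the 163.com branch added, the extractor dispatch in process_link is now a four-way startswith chain (Toutiao, WeChat, NetEase, empty fallback). As a purely hypothetical refactor, not something this patch does, the chain could become a prefix-to-extractor table so the next site is a one-line addition; only extractors this file already uses appear here:

    # Prefix-to-extractor table; order matters if prefixes ever overlap.
    EXTRACTORS = [
        ("https://www.toutiao.com", toutiao_extract_content),
        ("https://mp.weixin.qq.co", wechat_extract_content),
        ("https://www.163.com", wangyi_extract_content),
    ]

    def extract_any(link):
        for prefix, extractor in EXTRACTORS:
            if link.startswith(prefix):
                return extractor(link)
        return "", "", []  # same fallback as the else branch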
diff --git a/ArticleReplaceBatch/test.py b/ArticleReplaceBatch/test.py
index ccd652a..5098051 100644
--- a/ArticleReplaceBatch/test.py
+++ b/ArticleReplaceBatch/test.py
@@ -4,17 +4,24 @@
 import requests
 from bs4 import BeautifulSoup
 
-from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content
+from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content, wangyi_extract_content, souhu_extract_content
 from utils import handle_duplicate_files_advanced
-
+from images_edit import download_and_process_images
 
 # title,article,imgs = wechat_extract_content("https://mp.weixin.qq.com/s/3KejJOMuY2y6LA5k1tNwcg")
-# title,article,imgs = toutiao_w_extract_content("https://www.toutiao.com/w/1830082267985932/")
 # title,article,imgs = toutiao_extract_content("https://www.toutiao.com/article/7496132108239356479/")
-# print(imgs)
-# print(type(imgs))
+# title,article,imgs = wangyi_extract_content("https://www.163.com/dy/article/JV4K9D020553VRO2.html")
+title,article,imgs = souhu_extract_content("https://www.sohu.com/a/893588175_115479?scm=")
+print(title)
+print(article)
+print(imgs)
+print(type(imgs))
+#
+# download_and_process_images(imgs,"1")
 
-name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
-print(name[0])
\ No newline at end of file
+#
+# name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
+# print(name[0])
\ No newline at end of file
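test.py now exercises souhu_extract_content directly and dumps the results with bare prints. Since every extractor in get_web_content appears to return a (title, article, imgs) triple, the ad-hoc prints could be folded into a small helper; a sketch using only links already present in the file:

    def smoke(extractor, link):
        # Print a one-line summary of an extractor run: title preview,
        # article length, and image count.
        title, article, imgs = extractor(link)
        print(f"{extractor.__name__}: title={title[:30]!r} "
              f"article_chars={len(article)} images={len(imgs)}")

    smoke(souhu_extract_content, "https://www.sohu.com/a/893588175_115479?scm=")
    smoke(wangyi_extract_content, "https://www.163.com/dy/article/JV4K9D020553VRO2.html")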
diff --git a/ArticleReplaceBatch/txt2docx.py b/ArticleReplaceBatch/txt2docx.py
index 59b3914..2750323 100644
--- a/ArticleReplaceBatch/txt2docx.py
+++ b/ArticleReplaceBatch/txt2docx.py
@@ -145,9 +145,15 @@ def insert_images_into_paragraphs(paragraphs, image_folder, doc, title):
     :return:
     """
 
+    if os.path.exists(image_folder):
+        images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
+                         img.lower().endswith('jpg')])
+    else:
+        images = []
+
     # Get the image list and sort it
-    images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
-                     img.lower().endswith(('jpg'))])
+    # images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
+    #                  img.lower().endswith(('jpg'))])
 
     # images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
     #                  # img.lower().endswith(('png', 'jpg', 'jpeg'))])
@@ -271,7 +277,13 @@ def txt2docx(txt_path, image_path, keep_txt=True):
     else:
         new_text = text.replace("```markdown", "").replace("```", "")
         content = new_text
-    image_folder = img_path + '\\' + txt_name.replace(".txt", "").rstrip(".")
+    # image_folder = img_path + r'\\' + txt_name.replace(".txt", "").rstrip(".")
+    # image_folder = os.path.join(img_path, txt_name.replace(".txt", "").rstrip("."))
+    from pathlib import Path
+
+    img_path = Path(img_path)
+    image_folder = img_path / txt_name.replace(".txt", "").rstrip(".")
+
     # crop_and_replace_images(image_folder)
     create_word_document(content, image_folder, txt.replace(".txt", ".docx"), title_name)
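The txt2docx change replaces hand-rolled '\\' concatenation with a pathlib join, dropping the Windows-only separator assumption. A standalone illustration with made-up paths:

    from pathlib import Path

    img_path = r"F:\work\imgs"   # hypothetical base directory
    stem = "my_article"          # hypothetical txt_name minus ".txt"

    # Old style: hardcodes the Windows separator and breaks on other platforms.
    legacy = img_path + '\\' + stem

    # New style: Path composes with the right separator for the host OS.
    image_folder = Path(img_path) / stem
    print(image_folder)  # F:\work\imgs\my_article on Windows

A Path object is also accepted directly by the new os.path.exists guard in insert_images_into_paragraphs, so the two changes in this file work together.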