修改获取网页内容代码

This commit is contained in:
太一 2025-05-12 14:56:51 +08:00
parent 113c97c887
commit 666494c0c2
6 changed files with 80 additions and 52 deletions

View File

@ -78,7 +78,7 @@ class ArticleReplaceApp(tk.Tk):
# AI服务提供商选择 # AI服务提供商选择
ttk.Label(control_frame, text="工作流选择:").grid(row=2, column=0, padx=5, pady=5, sticky=tk.W) ttk.Label(control_frame, text="工作流选择:").grid(row=2, column=0, padx=5, pady=5, sticky=tk.W)
self.ai_service_var = tk.StringVar(value="dify") self.ai_service_var = tk.StringVar(value="coze")
ai_service_combo = ttk.Combobox(control_frame, textvariable=self.ai_service_var, values=["dify", "coze"], width=10, state="readonly") ai_service_combo = ttk.Combobox(control_frame, textvariable=self.ai_service_var, values=["dify", "coze"], width=10, state="readonly")
ai_service_combo.grid(row=2, column=1, padx=5, pady=5, sticky=tk.W) ai_service_combo.grid(row=2, column=1, padx=5, pady=5, sticky=tk.W)

View File

@ -87,7 +87,10 @@ def download_and_process_images(img_urls, article_title):
""" """
下载并处理图片 下载并处理图片
""" """
img_dir_path = os.path.join(IMGS_BASE_PATH, article_title) logger.info(IMGS_BASE_PATH)
img_dir_path = os.path.join(str(IMGS_BASE_PATH), str(article_title))
# img_dir_path = IMGS_BASE_PATH + "/" + article_title
logger.info(img_dir_path)
safe_open_directory(img_dir_path) safe_open_directory(img_dir_path)
for i, img_url in enumerate(img_urls): for i, img_url in enumerate(img_urls):

View File

@ -26,6 +26,8 @@ def process_link(link, ai_service):
else: else:
title_text, article_text, img_urls = "", "", [] title_text, article_text, img_urls = "", "", []
print(title_text)
if title_text == "": if title_text == "":
return return
elif len(title_text) > 100: elif len(title_text) > 100:
@ -111,7 +113,7 @@ def process_link(link, ai_service):
# finally_article = message_content.replace("正文:", "") + "\n" # finally_article = message_content.replace("正文:", "") + "\n"
file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text) file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]
article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt") article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")

View File

@ -2,7 +2,7 @@ import threading
import queue import queue
import json # 导入 json 模块 import json # 导入 json 模块
from ai_studio import call_dify_workflow, call_coze_workflow from ai_studio import call_dify_workflow, call_coze_workflow,call_coze_article_workflow
from databases import * from databases import *
from images_edit import download_and_process_images from images_edit import download_and_process_images
@ -25,6 +25,8 @@ def process_link(link, ai_service):
title_text, article_text, img_urls = toutiao_extract_content(link) title_text, article_text, img_urls = toutiao_extract_content(link)
elif link.startswith("https://mp.weixin.qq.co"): elif link.startswith("https://mp.weixin.qq.co"):
title_text, article_text, img_urls = wechat_extract_content(link) title_text, article_text, img_urls = wechat_extract_content(link)
elif link.startswith("https://www.163.com"):
title_text, article_text, img_urls = wangyi_extract_content(link)
else: else:
title_text, article_text, img_urls = "", "", [] title_text, article_text, img_urls = "", "", []
@ -44,7 +46,7 @@ def process_link(link, ai_service):
title = extract_content_until_punctuation(article_text).replace("正文:", "") title = extract_content_until_punctuation(article_text).replace("正文:", "")
print(title) print(img_urls)
print(article_text) print(article_text)
from datetime import datetime from datetime import datetime
@ -78,55 +80,56 @@ def process_link(link, ai_service):
# } # }
message_content = call_dify_workflow(input_data) message_content = call_dify_workflow(input_data)
elif ai_service == "coze": elif ai_service == "coze":
logger.info("coze正在处理") # logger.info("coze正在处理")
weijin = "" # weijin = ""
if check_keywords: # if check_keywords:
weijin = "违禁" # weijin = "违禁"
# 从配置加载 Coze input_data 模板 # # 从配置加载 Coze input_data 模板
input_data_template_str = CONFIG['Coze'].get('input_data_template', # input_data_template_str = CONFIG['Coze'].get('input_data_template',
'{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}') # '{{"article": "{article_text}", "link":"{link}", "weijin":"{weijin}"}}')
try: # try:
# 解析模板字符串为字典 # # 解析模板字符串为字典
input_data_template = json.loads(input_data_template_str) # input_data_template = json.loads(input_data_template_str)
# 使用实际变量格式化模板 # # 使用实际变量格式化模板
input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in # input_data = {k: v.format(article_text=article_text, link=link, weijin=weijin) for k, v in
input_data_template.items()} # input_data_template.items()}
except (json.JSONDecodeError, KeyError, AttributeError) as e: # except (json.JSONDecodeError, KeyError, AttributeError) as e:
logger.error(f"处理 Coze input_data 模板时出错: {e}. 使用默认模板.") # logger.error(f"处理 Coze input_data 模板时出错: {e}. 使用默认模板.")
# input_data = {
# "article": article_text,
# "link": link,
# "weijin": weijin
# }
input_data = { input_data = {
"article": article_text, "article": article_text
"link": link,
"weijin": weijin
} }
message_content = call_coze_article_workflow(input_data)
msg = call_coze_workflow(input_data)
message_content = msg['article']
result = msg['result']
if result == "已经创作过":
return
# 获取当前时间并格式化 # 获取当前时间并格式化
current_time = datetime.now().strftime("%H:%M:%S") current_time = datetime.now().strftime("%H:%M:%S")
# 打印当前时间 # 打印当前时间
print("当前时间:", current_time) print("当前时间:", current_time)
finally_article = message_content.replace("正文:", "") + "\n"
article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{title}.txt") file_name = handle_duplicate_files_advanced(ARTICLES_BASE_PATH,title_text)[0]
if '*' in finally_article or '#' in finally_article or "-" in finally_article:
article_save_path = os.path.join(ARTICLES_BASE_PATH, f"{file_name}.txt")
if '*' in message_content or '#' in message_content or "-" in message_content:
# 使用正则表达式一次性替换多个字符 # 使用正则表达式一次性替换多个字符
old_content = re.sub(r'[*#-]', '', message_content) old_content = re.sub(r'[*#-]', '', message_content)
else: else:
# 如果不需要替换,直接使用原内容 # 如果不需要替换,直接使用原内容
old_content = finally_article old_content = message_content
print("改写完成的文章:" + old_content) print("改写完成的文章:" + old_content)
# 删除AI词汇 # 删除AI词汇
content = old_content content = old_content
check_link_insert(host, user, password, database, link)
# 判断文章合规度 # 判断文章合规度
if text_detection(content) == "合规": if text_detection(content) == "合规":
@ -141,7 +144,7 @@ def process_link(link, ai_service):
logging.info('文本已经保存') logging.info('文本已经保存')
if img_urls: if img_urls:
download_and_process_images(img_urls, title) download_and_process_images(img_urls, file_name)
except Exception as e: except Exception as e:
logging.error(f"处理链接 {link} 时出错: {e}") logging.error(f"处理链接 {link} 时出错: {e}")
@ -163,13 +166,13 @@ def link_to_text(prompt1=None, prompt2=None, num_threads=None, ai_service="dify"
for link in links: for link in links:
logging.info(f"总共{len(links)}个链接") logging.info(f"总共{len(links)}个链接")
if check_link_exists(host, user, password, database, link): # if check_link_exists(host, user, password, database, link):
logger.info(f"链接已存在: {link}") # logger.info(f"链接已存在: {link}")
continue # continue
else: # else:
filtered_links.append(link) filtered_links.append(link)
logger.info(f"链接不存在: {link}") # logger.info(f"链接不存在: {link}")
print("链接不存在,存储到过滤器中:", link) # print("链接不存在,存储到过滤器中:", link)
if not filtered_links: if not filtered_links:
logger.info("没有新链接需要处理") logger.info("没有新链接需要处理")

View File

@ -4,17 +4,24 @@ import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content from get_web_content import wechat_extract_content, toutiao_w_extract_content, toutiao_extract_content,wangyi_extract_content,souhu_extract_content
from utils import handle_duplicate_files_advanced from utils import handle_duplicate_files_advanced
from images_edit import download_and_process_images
# title,article,imgs = wechat_extract_content("https://mp.weixin.qq.com/s/3KejJOMuY2y6LA5k1tNwcg") # title,article,imgs = wechat_extract_content("https://mp.weixin.qq.com/s/3KejJOMuY2y6LA5k1tNwcg")
# title,article,imgs = toutiao_w_extract_content("https://www.toutiao.com/w/1830082267985932/") # title,article,imgs = toutiao_w_extract_content("https://www.toutiao.com/w/1830082267985932/")
# title,article,imgs = toutiao_extract_content("https://www.toutiao.com/article/7496132108239356479/") # title,article,imgs = toutiao_extract_content("https://www.toutiao.com/article/7496132108239356479/")
# print(imgs) # title,article,imgs = wangyi_extract_content("https://www.163.com/dy/article/JV4K9D020553VRO2.html")
# print(type(imgs)) title,article,imgs = souhu_extract_content("https://www.sohu.com/a/893588175_115479?scm=")
print(title)
print(article)
print(imgs)
print(type(imgs))
#
# download_and_process_images(imgs,"1")
name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt") #
print(name[0]) # name = handle_duplicate_files_advanced(r"F:\work\code\python\ArticleReplaceBatch\articles","exeample.txt")
# print(name[0])

View File

@ -145,9 +145,15 @@ def insert_images_into_paragraphs(paragraphs, image_folder, doc, title):
:return: :return:
""" """
# 获取图片列表并排序 if os.path.exists(image_folder):
images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
img.lower().endswith(('jpg'))]) img.lower().endswith(('jpg'))])
else:
images = []
# 获取图片列表并排序
# images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
# img.lower().endswith(('jpg'))])
# images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if # images = sorted([os.path.join(image_folder, img) for img in os.listdir(image_folder) if
# # img.lower().endswith(('png', 'jpg', 'jpeg'))]) # # img.lower().endswith(('png', 'jpg', 'jpeg'))])
@ -271,7 +277,14 @@ def txt2docx(txt_path, image_path, keep_txt=True):
else: else:
new_text = text.replace("```markdown", "").replace("```", "") new_text = text.replace("```markdown", "").replace("```", "")
content = new_text content = new_text
image_folder = img_path + '\\' + txt_name.replace(".txt", "").rstrip(".") # image_folder = img_path + r'\\' + txt_name.replace(".txt", "").rstrip(".")
# image_folder = os.path.join(img_path, txt_name.replace(".txt", "").rstrip("."))
from pathlib import Path
from pathlib import Path
img_path = Path(img_path)
image_folder = img_path / txt_name.replace(".txt", "").rstrip(".")
# crop_and_replace_images(image_folder) # crop_and_replace_images(image_folder)
create_word_document(content, image_folder, txt.replace(".txt", ".docx"), title_name) create_word_document(content, image_folder, txt.replace(".txt", ".docx"), title_name)